### Student Information
Name: 楊叔晴

Student ID: 109071019

GitHub ID: shuching

Kaggle name: 
ShuChing Yang

Kaggle private scoreboard snapshot:

![Snapshot](img/pic0.png)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize

## 1. Load data

In [None]:
# every competition file is read from the same input folder
DATA_DIR = "../input/dm2022-isa5810-lab2-homework"

data_identification = pd.read_csv(f"{DATA_DIR}/data_identification.csv")
emotion = pd.read_csv(f"{DATA_DIR}/emotion.csv")
sampleSubmission = pd.read_csv(f"{DATA_DIR}/sampleSubmission.csv")
# the tweets file is read as JSON Lines (one JSON record per line)
raw_data = pd.read_json(f"{DATA_DIR}/tweets_DM.json", lines=True)

In [None]:
# preview the first ten rows of the identification table
data_identification.head(10)

In [None]:
# preview the first ten rows of the emotion table
emotion.head(10)

In [None]:
# preview the first ten rows of the sample submission
sampleSubmission.head(10)

In [None]:
# preview the first ten rows of the raw tweet records
raw_data.head(10)

In [None]:
# flatten the nested `_source` records into one column per field
df = pd.json_normalize(data=raw_data['_source'])

# Shorten the dotted column names produced by the flattening.
# Only the columns are renamed: the former `index=str` argument converted
# every row label to a string, and nothing in this notebook uses those
# labels (the merge that follows builds a new index anyway).
df = df.rename(columns={
    "tweet.text": "text",
    "tweet.tweet_id": "tweet_id",
    "tweet.hashtags": "hashtags",
})

In [None]:
# preview the first ten rows of the flattened, renamed tweet table
df.head(10)

In [None]:
# attach each tweet's train/test identification to the dataframe,
# matching rows on `tweet_id`
df = df.merge(data_identification, on="tweet_id")

## 2. Clean the text column

In [None]:
# clean the text
import re
from string import punctuation

# Translation table that deletes every ASCII punctuation character. It is built
# once here instead of re-escaping a regex pattern on every call, because the
# function is applied to every tweet.
_PUNCTUATION_TABLE = str.maketrans("", "", punctuation)


def preprocess_text(text):
    """Normalize a tweet for bag-of-words features.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation`` (this includes ``#`` and ``@``), and collapse any
    run of whitespace (spaces, tabs, newlines) into a single space while
    stripping leading and trailing whitespace.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The cleaned text as a ``str``.
    """
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)  # single pass, no regex needed
    return " ".join(text.split())

# run the cleaning function over every tweet and overwrite the text column
df['text'] = df['text'].apply(preprocess_text)

In [None]:
# preview the first ten rows after text cleaning
df.head(10)

In [None]:
# split into train and test dataset
# training rows get their emotion label joined in by tweet_id
train_df = df[df['identification'] == 'train']
train_df = pd.merge(train_df, emotion, on='tweet_id')

# .copy() makes the test subset an independent frame, so adding a column to it
# does not raise pandas' SettingWithCopyWarning about writing to a filtered
# selection of `df`
test_df = df[df['identification'] == 'test'].copy()
test_df["emotion"] = ""  # placeholder; overwritten with model predictions later

In [None]:
# remove the identification and hashtags columns from both datasets
unused_columns = ['identification', 'hashtags']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [None]:
# preview the first ten training rows
train_df.head(10)

In [None]:
# preview the first ten test rows
test_df.head(10)

## 3. Save data in Pickle

In [None]:
# file names used for the on-disk copies of the prepared datasets
TRAIN_PICKLE = "train_df.pkl"
TEST_PICKLE = "test_df.pkl"

# write both frames to pickle files, for speed
train_df.to_pickle(TRAIN_PICKLE)
test_df.to_pickle(TEST_PICKLE)

# read the frames back from the files just written
train_df = pd.read_pickle(TRAIN_PICKLE)
test_df = pd.read_pickle(TEST_PICKLE)

## 4. Exploratory Data Analysis (EDA)

In [None]:
# number of training tweets (non-null text) per emotion
train_df.groupby('emotion')['text'].count()

In [None]:
# bar chart of the share (in %) of training tweets per emotion
labels = train_df['emotion'].unique()
post_total = len(train_df)
df1 = train_df.groupby('emotion')['text'].count()
# convert raw counts to percentages of all training tweets, rounded to 3 decimals
df1 = df1.map(lambda n_posts: round(n_posts * 100 / post_total, 3))

# draw the bars on an explicit axes object
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(df1.index, df1.values)

# axis labels, title and grid
ax.set_ylabel('% of instances')
ax.set_xlabel('Emotion')
ax.set_title('Emotion distribution')
ax.grid(True)
plt.show()

## 5. Feature Engineering

#### 5.1 Count Vectorizer & Tweet Tokenizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer

# tokenizer callable handed to CountVectorizer, backed by NLTK's TweetTokenizer
def tknzr(text):
    """Return the tokens of ``text`` produced by a new NLTK ``TweetTokenizer``."""
    return TweetTokenizer().tokenize(text)

# Learn the vocabulary (limited to 100,000 features) and build the training
# bag-of-words matrix in a single pass. Calling fit() and then transform() on
# the same texts would tokenize every training tweet twice.
count_vect = CountVectorizer(max_features=100000, tokenizer=tknzr)
bow_transformed = count_vect.fit_transform(train_df['text'])

## 6. Model

In [None]:
from sklearn.model_selection import train_test_split

# features: bag-of-words matrices produced by the vectorizer fitted on the training text
X_train, X_test = bow_transformed, count_vect.transform(test_df['text'])
# labels: the emotion column of each dataset
y_train, y_test = train_df['emotion'], test_df['emotion']

In [None]:
# report the dimensions of every feature matrix and label vector
for name, data in (('X_train', X_train), ('y_train', y_train),
                   ('X_test', X_test), ('y_test', y_test)):
    print(f'{name}.shape: ', data.shape)

In [None]:
# dimensions of the train and test dataframes
for frame in (train_df, test_df):
    print(frame.shape)

#### 6.1 Deal with categorical label

In [None]:
# deal with label (string -> one-hot)

import keras
from keras import utils as np_utils
from sklearn.preprocessing import LabelEncoder

# learn the mapping from emotion names to integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print('check label: ', label_encoder.classes_)
print('\n## Before convert')
# the printed label now matches the slice actually shown (first eight labels)
print('y_train[0:8]:\n', y_train[0:8])
print('\ny_train.shape: ', y_train.shape)

def label_encode(le, labels):
    """Convert string labels to one-hot rows.

    The one-hot width is always ``len(le.classes_)``, so the result has one
    column per class known to the encoder even when ``labels`` does not
    contain the last class (or is empty). The conversion uses NumPy rather
    than ``keras.utils.np_utils``, which is not the public Keras entry point
    for ``to_categorical``.

    Args:
        le: A fitted encoder exposing ``transform`` and ``classes_``
            (a scikit-learn ``LabelEncoder`` in this notebook).
        labels: Sequence of labels understood by ``le``.

    Returns:
        A ``float32`` NumPy array of shape ``(len(labels), len(le.classes_))``
        with a single 1.0 per row.
    """
    # integer dtype is forced so that an empty input still indexes cleanly
    enc = np.asarray(le.transform(labels), dtype=int)
    # row i of the identity matrix is the one-hot vector for class i
    return np.eye(len(le.classes_), dtype="float32")[enc]

def label_decode(le, one_hot_label):
    """Map one-hot (or probability) rows back to their original labels.

    Each row is reduced to the index of its largest entry, and those indices
    are passed to ``le.inverse_transform``.
    """
    class_ids = np.asarray(one_hot_label).argmax(axis=1)
    return le.inverse_transform(class_ids)

# replace the string labels with their one-hot representation
y_train = label_encode(le=label_encoder, labels=y_train)

#### 6.2 Build model

In [None]:
# I/O check: network input width = number of feature columns,
# network output width = number of emotion classes
input_shape = X_train.shape[1]
output_shape = len(label_encoder.classes_)

print('input_shape: ', input_shape)
print('output_shape: ', output_shape)

In [None]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer: one value per feature column (width = input_shape)
model_input = Input(shape=(input_shape, ))

# two hidden layers, each a 64-unit Dense layer followed by a ReLU activation
hidden = model_input
for _ in range(2):
    pre_activation = Dense(units=64)(hidden)
    hidden = ReLU()(pre_activation)

# output layer: one unit per emotion class (width = output_shape), softmax-normalised
logits = Dense(units=output_shape)(hidden)
model_output = Softmax()(logits)

# assemble the network from its input and output tensors
model = Model(inputs=[model_input], outputs=[model_output])

# optimizer, loss function and reported metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# print the layer-by-layer structure
model.summary()

In [None]:
from keras.callbacks import CSVLogger

# callback that writes the training log to a CSV file
csv_logger = CSVLogger('training_log.csv')

# training hyper-parameters
epochs, batch_size = 2, 32

# fit the network on the bag-of-words features and one-hot labels
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[csv_logger],
)

print('training finish')

#### 6.3  Predict on testing data

In [None]:
from sklearn.metrics import accuracy_score

# class scores for every test tweet, then back to emotion names
class_scores = model.predict(X_test, batch_size=128)
prediction = label_decode(label_encoder, class_scores)

In [None]:
# store the predictions and write the two-column submission file (id, emotion)
test_df['emotion'] = prediction
output = test_df[['tweet_id', 'emotion']].rename(columns={'tweet_id': 'id'})
output.to_csv("submission.csv", index=False)
output