In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.io as pio
pio.templates.default = "plotly_dark"
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as ps

import re
import string

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from wordcloud import WordCloud, STOPWORDS

import nltk
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

import warnings
warnings.filterwarnings("ignore")

# Data Importing

In [None]:
df = pd.read_csv("Tweets.csv")

In [None]:
df.head()

In [None]:
df.info()

# preprocessing

In [None]:
df.dropna(inplace= True)

In [None]:
df1 = df.drop(["textID" , "selected_text"] , axis = 1)

In [None]:
_stopwords = set(STOPWORDS).union(stopwords.words('english'))

In [None]:
_stopwords.add("like")

In [None]:
def preprocessor(text):
    """Normalize a raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order: replace every non-letter character with a space, lowercase,
    strip, tokenize, lemmatize each token as a verb, re-tokenize, and drop every
    token found in the notebook-level `_stopwords` set.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The kept tokens joined by single spaces (empty string if none remain).
    """
    text = re.sub('[^a-zA-Z]', ' ', text).lower().strip()
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    lemmatized = ' '.join(lemmatizer.lemmatize(word, pos='v') for word in word_tokenize(text))
    # The lemmatized text is tokenized again before the stopword filter is applied.
    return ' '.join(word for word in word_tokenize(lemmatized) if word not in _stopwords)

In [None]:
df1["text"] = df1["text"].apply(preprocessor)

In [None]:
df1.insert(2,"label" , df1.sentiment.map({"neutral" : 0, "negative": -1 , "positive" : 1}))

In [None]:
# df1.to_csv("Tweets_preprocessed.csv")

In [None]:
df1.head()

In [None]:
df1.tail()

# Data Analysis

In [None]:
colormap = {"positive" : "white" ,
           "negative": "red",
           "neutral" : "blue"}

## Label Stats

In [None]:
df1.describe()

In [None]:
px.pie(df , names = "sentiment" , values = np.ones_like(df1.sentiment) ,color = "sentiment" , color_discrete_map = colormap , hole= 0.3 )

## Word frequency

In [None]:
def all_text(text_df):
    """Concatenate every word longer than three characters into one string.

    Parameters
    ----------
    text_df : pandas.Series
        Series of strings; only `.values` is used, and each value is split on
        whitespace.

    Returns
    -------
    str
        The kept words separated by single spaces, in their original order
        (empty string when nothing is kept).
    """
    # Flatten all tweets and join once. Joining tweet by tweet with no separator
    # between tweets glued the last word of one tweet to the first word of the next.
    return ' '.join(
        word
        for tweet in text_df.values
        for word in tweet.split()
        if len(word) > 3
    )

In [None]:
def word_freq(text_df):
    """Count word occurrences in a text series and return them as a sorted table.

    The series is first flattened with `all_text`, the resulting string is split
    on whitespace, and the tokens are counted with `nltk.FreqDist`.

    Returns
    -------
    pandas.DataFrame
        Columns "words" and "count", ordered by "count" from highest to lowest.
    """
    tokens = all_text(text_df).split()
    distribution = nltk.FreqDist(tokens)
    table = pd.DataFrame(
        {
            "words": list(distribution),
            "count": [distribution[token] for token in distribution],
        }
    )
    return table.sort_values(by="count", ascending=False)

In [None]:
neut_freq = word_freq(df1[df1.sentiment == "neutral"]["text"])

pos_freq =  word_freq(df1[df1.sentiment == "positive"]["text"])

neg_freq =  word_freq(df1[df1.sentiment == "negative"]["text"])

In [None]:
def draw_wordcloud(words_df , title , color):
    """Render a word cloud from a frequency table on the current matplotlib axes.

    Parameters
    ----------
    words_df : pandas.DataFrame
        Table with a "words" column and a "count" column.
    title : str
        Title placed above the image.
    color : str
        Background colour passed to `WordCloud`.
    """
    # Map each word to its count, which is the form `fit_words` is given here.
    frequencies = words_df.set_index("words").to_dict()["count"]
    cloud = WordCloud(stopwords=_stopwords, background_color=color)
    plt.imshow(cloud.fit_words(frequencies))
    plt.title(title)
    plt.axis("off")

In [None]:
# One row of three word clouds (negative, neutral, positive), each drawn on the
# background colour assigned to its sentiment in `colormap`.
plt.figure(figsize=(20, 30), facecolor='black')
wordcloud_panels = [
    (neg_freq, "negative words", colormap["negative"]),
    (neut_freq, "neutral words", colormap["neutral"]),
    (pos_freq, "positive words", colormap["positive"]),
]
for panel_position, (freq_table, panel_title, background) in enumerate(wordcloud_panels, start=1):
    plt.subplot(1, 3, panel_position)
    draw_wordcloud(freq_table, panel_title, background)

plt.show()

In [None]:
neut_freq.insert(1 , "sentiment" ,"neutral" )
neg_freq.insert(1 , "sentiment" ,"negative" )
pos_freq.insert(1 , "sentiment" ,"positive" )

In [None]:
freq_words = pd.concat([ neut_freq[: 10] , neg_freq[: 10] ,pos_freq[: 10]])

In [None]:
px.sunburst(freq_words ,path= ["sentiment" , "words"],values = "count" , color = "sentiment" , color_discrete_map = colormap )

## Text stats

In [None]:
def get_lengths(df , sentiment):
    """Measure tweet and word lengths for the rows of one sentiment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "sentiment" column and a "text" column of strings.
    sentiment : str
        Sentiment value used to select rows.

    Returns
    -------
    tuple of pandas.DataFrame
        First frame: "twt_length" (whitespace-separated token count per tweet)
        and "sentiment". Second frame: "wrd_length" (character count per token)
        and "sentiment".
    """
    selected_texts = df[df.sentiment == sentiment].text
    tokenized = [tweet.split() for tweet in selected_texts]

    tweet_sizes = [len(tokens) for tokens in tokenized]
    word_sizes = [len(token) for tokens in tokenized for token in tokens]

    tweet_frame = pd.DataFrame({"twt_length" : tweet_sizes})
    tweet_frame.insert(1, "sentiment", sentiment)

    word_frame = pd.DataFrame({"wrd_length" : word_sizes})
    word_frame.insert(1, "sentiment", sentiment)

    return tweet_frame, word_frame

In [None]:
neut_len = get_lengths(df , "neutral")
pos_len =  get_lengths(df , "positive")
neg_len = get_lengths(df , "negative")

twt_len = pd.concat([neut_len[0] , pos_len[0] ,  neg_len[0]])
wrd_len = pd.concat([neut_len[1] , pos_len[1] ,  neg_len[1]])

In [None]:
# Histogram of tweet lengths coloured by sentiment, with a box-plot marginal.
fig = px.histogram(twt_len , x= 'twt_length'  , color="sentiment", marginal="box" , color_discrete_map= colormap , title = "tweets length")
# Add a single toggle button that swaps the x data, the figure title and the
# x-axis title between tweet lengths and word lengths.
fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            bgcolor = "rgb(17,17,17)",
            direction="right",
            font = {"color": "white"},
            showactive = False,
            x=1.15,
            y=0,
            buttons=list(
                [
                    dict(
                        label="tweets / words",
                        method="update",
                        # `args` holds the tweet-length state and `args2` the word-length state.
                        # NOTE(review): "x" is a one-element list holding the whole column; it looks
                        # like this would be applied to every trace, so the per-sentiment split may
                        # be lost after the first click — confirm against the rendered figure.
                        args=[{"x": [twt_len["twt_length"]] },{"title" : "tweets length" , "xaxis": {'title': "twt_length"}}], # , "xaxis":[{'title': {'text': 'twt_length'}}]
                        args2 = [{"x": [wrd_len["wrd_length"]]},{"title" : "words length" , "xaxis": { 'title': "wrd_length"}}]
                    ),
                ]
            ),
        )
    ]
)

## objects deletion 

In [None]:
# Free space for model calculation: drop the exploratory objects that the
# modelling cells below do not read.
# NOTE: after this cell `_stopwords`, `stopwords` and `STOPWORDS` no longer exist, so
# `preprocessor` and `draw_wordcloud` (which read `_stopwords`) cannot be called again,
# and this cell itself raises NameError if it is run a second time.

del STOPWORDS
del fig
del freq_words
del neg_freq
del neg_len
del neut_freq
del neut_len
del pos_freq
del pos_len
del stopwords
del twt_len
del wrd_len
del _stopwords

# Model creation

In [None]:
from sklearn.model_selection import train_test_split , cross_validate , GridSearchCV
from sklearn.metrics import classification_report , confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import joblib

In [None]:
def model_analasys(y_true , y_pred):
    """Print a classification report and show the confusion matrix as a heatmap.

    The class names "negative", "neutral", "positive" are attached, in that
    order, to the report rows and to both heatmap axes.

    Parameters
    ----------
    y_true : array-like
        Target labels.
    y_pred : array-like
        Predicted labels.
    """
    class_names = ["negative" , "neutral" , "positive"]

    report = classification_report(y_true=y_true, y_pred=y_pred, target_names=class_names)
    print(report)

    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    heatmap = px.imshow(
        matrix,
        x=class_names,
        y=class_names,
        labels={"x" : "predicted" , "y" : "target"},
        color_continuous_scale=px.colors.sequential.PuBu,
    )
    heatmap.show()

## data preparation 

In [None]:
X = df1.text
y = df1.label

### TF IDF

In [None]:
tf_idf = TfidfVectorizer()
X = tf_idf.fit_transform(X)

In [None]:
X.shape

### SVD

In [None]:
# svd = TruncatedSVD(10_000)
# svd.fit(X)

# exp_var_cumul = np.cumsum(svd.explained_variance_ratio_)
# np.save("exp_var_cumul" , exp_var_cumul)

exp_var_cumul = np.load("exp_var_cumul.npy")

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)

In [None]:
svd = TruncatedSVD()
X_2d = svd.fit_transform(X)

In [None]:
px.scatter(x = X_2d[: ,0] , y = X_2d[: ,1]  , color= df1.sentiment , color_discrete_map= colormap)

In [None]:
svd = TruncatedSVD(4_000)
X = svd.fit_transform(X)
np.save("data_preprocessed",X)

### load preprocessed data

In [None]:
X = np.load("data_preprocessed.npy")

### Train Test Split

In [None]:
X_train , X_test , y_train , y_test =  train_test_split(X , y , test_size= 0.25 , random_state= 42)

In [None]:
total_num_of_samples = X.shape[0]
print("           | features | samples | %")
print("Data size  | " ,X.shape[1] , "   | " , X.shape[0] , " | " , X.shape[0]/total_num_of_samples )
print("Train size | " ,X_train.shape[1] , "   | " , X_train.shape[0] , " | " , X_train.shape[0]/total_num_of_samples )
print("Test size  | " ,X_test.shape[1] , "   | " , X_test.shape[0] , "  | " , X_test.shape[0]/total_num_of_samples )

## basic models

### model 0

In [None]:
def model_0_fit(label):
    """Build a baseline that guesses labels at random using the training class frequencies.

    Parameters
    ----------
    label : pandas.Series
        Training labels; their relative frequencies become the sampling probabilities.

    Returns
    -------
    callable
        `model(x)` returns a NumPy array of `x.shape[0]` labels drawn with
        `np.random.choice`.
    """
    # Take the classes and their probabilities from the same value_counts() result so
    # they stay aligned: unique() lists classes in order of appearance, whereas
    # value_counts() orders them by frequency, so pairing the two mismatched them.
    frequencies = label.value_counts(normalize=True)
    classes = frequencies.index.to_numpy()
    probabilities = frequencies.to_numpy()

    def model(x):
        return np.random.choice(classes, size=x.shape[0], p=probabilities)
    return model

In [None]:
model_0 = model_0_fit(y_train)

In [None]:
y_pred_0 = model_0(X_test)

In [None]:
model_analasys(y_test , y_pred_0)

### Static model (rule-based VADER thresholds)

In [None]:
def nltk_model(X , th1 , th2):
    """Label texts by thresholding the VADER compound polarity score.

    Parameters
    ----------
    X : iterable of str
        Texts to score with `SentimentIntensityAnalyzer.polarity_scores`.
    th1 : float
        Scores strictly below this value are labelled -1.
    th2 : float
        Scores strictly above this value (and not below `th1`) are labelled 1.

    Returns
    -------
    list of int
        One label per text: -1, 1, or 0 when neither threshold is crossed.
    """
    analyzer = SentimentIntensityAnalyzer()

    def _classify(document):
        compound = analyzer.polarity_scores(document)['compound']
        if compound < th1:
            return -1
        return 1 if compound > th2 else 0

    return [_classify(document) for document in X]

In [None]:
y_pred_nltk = nltk_model(df.text , -0.01 , 0.01)

In [None]:
model_analasys(y , y_pred_nltk)

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()
rf_param = {
    "n_estimators" : [50, 100, 150]
}
rf_gcv = GridSearchCV(estimator= rf , param_grid= rf_param , n_jobs= -1 , verbose= 3 , cv = 3)

In [None]:
rf_gcv.fit(X_train , y_train)

In [None]:
pd.DataFrame(rf_gcv.cv_results_)

In [None]:
model_analasys(y_test , rf_gcv.predict(X_test))

### Save model

In [None]:
joblib.dump(rf_gcv.best_estimator_ , "RandomForest_Model.pkl")

### Load model

In [None]:
rf_gcv = joblib.load("RandomForest_Model.pkl")

## SVC

In [None]:
from sklearn.svm import SVC

In [None]:
svc =  SVC()
svc_param = {
    "C" : [1, 2],
    "kernel":["poly", "rbf"]
}
svc_gcv = GridSearchCV(estimator= svc , param_grid= svc_param , n_jobs= -1 , verbose= 3 , cv = 3)

In [None]:
svc_gcv.fit(X_train , y_train)

In [None]:
pd.DataFrame(svc_gcv.cv_results_)

In [None]:
model_analasys(y_test , svc_gcv.predict(X_test))

### Save model

In [None]:
joblib.dump(svc_gcv.best_estimator_ , "SVC_Model.pkl")

### Load model

In [None]:
svc_gcv = joblib.load("SVC_Model.pkl")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()
knn_param = {
    "n_neighbors" : [5 ,10 , 15 , 20 , 25 ,30, 35, 40, 45, 50],
}
knn_gcv = GridSearchCV(estimator= knn , param_grid= knn_param , n_jobs= -1 , verbose= 3 , cv = 3)

In [None]:
knn_gcv.fit(X_train , y_train)

In [None]:
pd.DataFrame(knn_gcv.cv_results_)

In [None]:
model_analasys(y_test , knn_gcv.predict(X_test))

### Save model

In [None]:
joblib.dump(knn_gcv.best_estimator_ , "KNN_Model.pkl")

### Load model

In [None]:
knn_gcv = joblib.load("KNN_Model.pkl")

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()
# The estimator's default solver does not support the elastic-net penalty, and
# elastic-net also needs an explicit l1_ratio. Give that candidate its own sub-grid
# with the "saga" solver so it can actually be fitted; the l2 candidate keeps the
# estimator defaults. l1_ratio=0.5 is an even l1/l2 mix chosen as a starting point.
lr_param = [
    {"penalty" : ["l2"]},
    {"penalty" : ["elasticnet"], "solver" : ["saga"], "l1_ratio" : [0.5]},
]
lr_gcv = GridSearchCV(estimator= lr , param_grid= lr_param , n_jobs= -1 , verbose= 3 , cv = 3)

In [None]:
lr_gcv.fit(X_train , y_train)

In [None]:
pd.DataFrame(lr_gcv.cv_results_)

In [None]:
model_analasys(y_test , lr_gcv.predict(X_test))

### Save model

In [None]:
joblib.dump(lr_gcv.best_estimator_ , "LogisticRegression_Model.pkl")

### Load model

In [None]:
lr_gcv = joblib.load("LogisticRegression_Model.pkl")

## GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb = GaussianNB()

In [None]:
gnb_cv = cross_validate(gnb,X_train , y_train , cv = 3 , verbose= 1)

In [None]:
pd.DataFrame(gnb_cv)

In [None]:
gnb.fit(X_train , y_train)

### Save model

In [None]:
joblib.dump(gnb , "GaussianNB_Model.pkl")

### Load model

In [None]:
gnb = joblib.load("GaussianNB_Model.pkl")

## Voting

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
from tensorflow.keras.layers import LSTM , Embedding , Dense , Dropout , Activation , GlobalMaxPool1D
from tensorflow.keras import Sequential,optimizers , Input , regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
def draw_lines(y1 , y2 , lbl1 , lbl2 , x):
    """Plot two per-epoch curves on a black figure with white axes.

    Parameters
    ----------
    y1, y2 : sequence
        Values for the red and the blue curve, one per epoch.
    lbl1, lbl2 : str
        Legend labels for `y1` and `y2`.
    x : int
        Number of epochs; the x axis runs from 1 to `x` inclusive.
    """
    plt.figure(figsize=(20, 10), facecolor="black")
    axes = plt.axes(facecolor="black")
    epochs = range(1, x + 1)

    for series, line_format, caption in ((y1, '-r', lbl1), (y2, '-b', lbl2)):
        plt.plot(epochs, series, line_format, label=caption)

    # White ticks and spines so the axes stay visible on the black background.
    for axis_name in ('x', 'y'):
        axes.tick_params(axis=axis_name, colors='white')
    for side in ('left', 'bottom'):
        axes.spines[side].set_color('white')

    plt.legend()
    plt.show()

In [None]:
def softmax_prediction(y_pred):
    """Turn rows of class scores into three-element one-hot lists.

    Parameters
    ----------
    y_pred : iterable of array-like
        One row of scores per sample.

    Returns
    -------
    list of list of int
        For every row, a list of three zeros with a 1 at the position of the
        row's largest score.
    """
    def _one_hot(scores):
        slots = [0] * 3
        slots[np.argmax(scores)] = 1
        return slots

    return [_one_hot(scores) for scores in y_pred]

### data preparation

In [None]:
def models_to_df(models, X, y):
    """Collect the class probabilities of several models plus the labels in one frame.

    Parameters
    ----------
    models : list
        Fitted objects exposing `predict_proba(X)`; three columns named
        "<ClassName>_-1", "<ClassName>_0", "<ClassName>_1" are created per model.
    X : array-like
        Feature matrix passed to every model.
    y : numpy.ndarray
        Labels, one per row of `X`; stored in the last column, "labels".

    Returns
    -------
    pandas.DataFrame
        Probability columns of all models side by side, followed by "labels".
    """
    blocks = []
    for estimator in models:
        blocks.append(estimator.predict_proba(X))
    row_count = blocks[0].shape[0]
    blocks.append(y.reshape(row_count, 1))

    columns = []
    for estimator in models:
        prefix = type(estimator).__name__
        for class_label in (-1, 0, 1):
            columns.append("{0}_{1}".format(prefix, class_label))
    columns.append("labels")

    return pd.DataFrame(np.concatenate(blocks, axis=1), columns=columns)

In [None]:
models_proba = [knn_gcv  , lr_gcv  , rf_gcv , gnb]

In [None]:
df_pred_prob_train = models_to_df(models_proba , X_train , np.array(y_train))

In [None]:
px.imshow(df_pred_prob_train.corr())

In [None]:
X_train_v = df_pred_prob_train.drop("labels",axis  = 1)
y_train_v = to_categorical(df_pred_prob_train.labels , 3)

In [None]:
df_pred_prob_test = models_to_df(models_proba , X_test , np.array(y_test))

In [None]:
# Build the hold-out inputs/targets from the TEST probability frame. Reading them from
# df_pred_prob_train made the "test" evaluation of the voting network run on the very
# rows it was trained on.
X_test_v = df_pred_prob_test.drop("labels",axis  = 1)
y_test_v = to_categorical(df_pred_prob_test.labels , 3)

### Network creation

In [None]:
d1_v = Dense(100, activation='relu' , kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4))
d2_v = Dense(50, activation='relu' , kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4))
d3_v = Dense(25, activation='relu' , kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4))
d4_v = Dense(3, activation='softmax')

In [None]:
network_v = [d1_v ,Dropout(0.2) , d2_v , Dropout(0.3) , d3_v , d4_v]

In [None]:
# Name the network when it is constructed: assigning `model.name` afterwards fails on
# Keras versions where `name` is a read-only property.
model_v = Sequential(name="Voting")
# One input per probability column (every column of the frame except "labels").
model_v.add(Input(shape=(df_pred_prob_train.shape[1]-1,)))
for layer in network_v:
    model_v.add(layer)
adam = optimizers.Adam()
model_v.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics = ['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line.
model_v.summary()

In [None]:
stopping = EarlyStopping(monitor='val_loss' , patience= 2 ,restore_best_weights = True)

### Training

In [None]:
# `callbacks` is documented as a list of callbacks, so wrap the single EarlyStopping.
model_v.fit(x= X_train_v , y= y_train_v ,  epochs= 20 ,validation_split= 0.15 ,callbacks= [stopping])

In [None]:
history_v = pd.DataFrame(model_v.history.history)

In [None]:
draw_lines(history_v.loss , history_v.val_loss , "loss" , "val loss" , history_v.shape[0])

In [None]:
draw_lines(history_v.accuracy , history_v.val_accuracy , "accuracy" , "val accuracy" ,history_v.shape[0])

### Testing

In [None]:
y_pred_v = model_v.predict(X_test_v)
y_pred_v = softmax_prediction(y_pred_v)

In [None]:
# The one-hot targets come from to_categorical(labels, 3) with labels in {-1, 0, 1}.
# The label is used as a column index, so -1 lands in the LAST column: columns 0, 1, 2
# stand for labels 0, 1, -1. Map argmax positions back to labels so the report rows
# (negative, neutral, positive) line up with the real classes.
column_to_label = np.array([0, 1, -1])
model_analasys(column_to_label[y_test_v.argmax(axis=1)] , column_to_label[np.array(y_pred_v).argmax(axis=1)])

## LSTM

### data preparation 

In [None]:
vocabsize = 20000

In [None]:
tok = Tokenizer(num_words= vocabsize)
tok.fit_on_texts(df1.text)
seqs = tok.texts_to_sequences(df1.text)

In [None]:
X_lstm = pad_sequences(seqs , padding= "post")
y_lstm = to_categorical(y , 3)

In [None]:
maxlen = X_lstm.shape[1]

In [None]:
X_train_lstm , X_test_lstm , y_train_lstm , y_test_lstm = train_test_split(X_lstm, y_lstm, test_size= 0.25, random_state= 42)

### Network creation

In [None]:
embedding = Embedding(input_dim= vocabsize , output_dim = 50 )
lstm = LSTM(100 ,return_sequences= True)
gmp = GlobalMaxPool1D()
d1_lstm = Dense(50, activation='relu' , kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4))
d2_lstm = Dense(25, activation='relu' , kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4))
d3_lstm = Dense(3, activation='softmax')

In [None]:
network_lstm = [embedding , lstm ,gmp, d1_lstm ,Dropout(0.2) , d2_lstm , Dropout(0.3) , d3_lstm]

In [None]:
# Name the network when it is constructed: assigning `model.name` afterwards fails on
# Keras versions where `name` is a read-only property.
model_lstm = Sequential(name="LSTM")
# Each sample is one padded token-id sequence of length `maxlen`.
model_lstm.add(Input(shape=(maxlen,)))
for layer in network_lstm:
    model_lstm.add(layer)
# NOTE(review): the `decay` keyword may not be accepted by every Keras optimizer
# implementation — confirm against the installed version.
rmp = optimizers.RMSprop(learning_rate= 15e-5 ,momentum=0.01 ,decay= .0001)
model_lstm.compile(optimizer = rmp, loss = 'categorical_crossentropy', metrics = ['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line.
model_lstm.summary()


### Training

In [None]:
# `callbacks` is documented as a list of callbacks, so wrap the single EarlyStopping.
model_lstm.fit(x= X_train_lstm , y= y_train_lstm ,  epochs= 20 ,validation_split= 0.15 ,callbacks= [stopping])

In [None]:
history_lstm = pd.DataFrame(model_lstm.history.history)

In [None]:
draw_lines(history_lstm.loss , history_lstm.val_loss , "loss" , "val loss" , history_lstm.shape[0])

In [None]:
draw_lines(history_lstm.accuracy , history_lstm.val_accuracy , "accuracy" , "val accuracy" ,history_lstm.shape[0])

### Testing

In [None]:
y_pred_lstm = model_lstm.predict(X_test_lstm)
y_pred_lstm = softmax_prediction(y_pred_lstm)

In [None]:
# The one-hot targets come from to_categorical(y, 3) with labels in {-1, 0, 1}.
# The label is used as a column index, so -1 lands in the LAST column: columns 0, 1, 2
# stand for labels 0, 1, -1. Map argmax positions back to labels so the report rows
# (negative, neutral, positive) line up with the real classes.
column_to_label = np.array([0, 1, -1])
model_analasys(column_to_label[y_test_lstm.argmax(axis=1)] , column_to_label[np.array(y_pred_lstm).argmax(axis=1)])

# results

In [None]:
# Evaluation order used by the score lists below: the probability models, then the
# SVC, the LSTM and the voting network.
models = models_proba.copy()
models.append(svc_gcv)
models.append(model_lstm)
models.append(model_v)
# Bar-chart labels must be distinct. Both neural networks are `Sequential`, so use the
# name each Keras model was given; for a scikit-learn search object that was not
# replaced by its reloaded estimator, label it by the class of its best estimator.
names = [
    m.name if isinstance(m, tf.keras.Model) else type(getattr(m, "best_estimator_", m)).__name__
    for m in models
]

In [None]:
from sklearn.metrics import accuracy_score , recall_score , precision_score

In [None]:
def prob_to_label(prob):
    """Convert rows of class probabilities into labels.

    Parameters
    ----------
    prob : array-like
        Two-dimensional scores, one row per sample.

    Returns
    -------
    numpy.ndarray
        Position of the largest value in each row, shifted down by one
        (column 0 becomes -1, column 1 becomes 0, column 2 becomes 1).
    """
    winning_column = np.array(prob).argmax(axis=1)
    return winning_column - 1

In [None]:
y_preds = []
for i , m in enumerate(models_proba):
    y_preds.append(prob_to_label(df_pred_prob_test.iloc[: , i*3:i*3+3]))

In [None]:
y_preds.append(svc_gcv.predict(X_test))
y_preds.append(y_pred_lstm)
y_preds.append(y_pred_v)

In [None]:
# Collect accuracy, macro recall and macro precision for every model, in the same
# order as `y_preds`.
accuracy_scores = []
recall_scores = []
precision_scores = []
# The first five entries of y_preds (the four probability models and the SVC) are
# scored against y_test.
for i in range(5):
    accuracy_scores.append(accuracy_score(y_test , y_preds[i]))
    recall_scores.append(recall_score(y_test , y_preds[i] , average= "macro"))
    precision_scores.append(precision_score(y_test , y_preds[i] , average= "macro"))

# The LSTM is scored on its one-hot targets and one-hot predictions.
accuracy_scores.append(accuracy_score(y_test_lstm , y_pred_lstm))
recall_scores.append(recall_score(y_test_lstm , y_pred_lstm , average= "macro"))
precision_scores.append(precision_score(y_test_lstm , y_pred_lstm , average= "macro"))

# The voting network is scored on its one-hot targets and one-hot predictions.
accuracy_scores.append(accuracy_score(y_test_v , y_pred_v))
recall_scores.append(recall_score(y_test_v , y_pred_v , average= "macro"))
precision_scores.append(precision_score(y_test_v , y_pred_v , average= "macro"))


In [None]:
import plotly.subplots as ps

In [None]:
t1 = go.Bar(x = names,y = accuracy_scores, name = "Accuracy" )
t3 = go.Bar(x = names,y = recall_scores, name = "Recall")
t2 = go.Bar(x = names,y = precision_scores, name = "Precision")
data = [t1 , t2 , t3]
layout = go.Layout(title= "Results")
fig = go.Figure(data , layout)
fig.show()

In [None]:
t1 = go.Bar(x = names,y = accuracy_scores, name = "Accuracy" , marker_color= list(range(len (names))))
data = [t1 , t2 , t3]
layout = go.Layout(title= "Accuracy")
fig = go.Figure(t1 , layout)
fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            bgcolor = "rgb(17,17,17)",
            direction="down",
            font = {"color": "white"},
            showactive = False,
            x=1.10,
            y=0.5,
            buttons=list(
                [
                    dict(
                        label="recall",
                        method="update",
                        args=[{"y": [recall_scores] },{"title" : "Recall"}], # , "xaxis":[{'title': {'text': 'twt_length'}}]
                    ),
                    dict(
                        label="precision",
                        method="update",
                        args=[{"y": [precision_scores] },{"title" : "Precision"}], # , "xaxis":[{'title': {'text': 'twt_length'}}]
                    ),
                    dict(
                        label="accuracy",
                        method="update",
                        args=[{"y": [accuracy_scores] },{"title" : "Accuracy"}], # , "xaxis":[{'title': {'text': 'twt_length'}}]
                    ),
                    
                ]
            ),
        )
    ]
)
fig.show()