In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the GOP-debate tweet sentiment CSV from the Kaggle input directory
# and preview its first three rows.
df = pd.read_csv('/kaggle/input/first-gop-debate-twitter-sentiment/Sentiment.csv')
df.head(3)

In [None]:
# The id, tweet_created and tweet_id columns can be deleted.

# **Preprocessing data**

In [None]:
# Column dtypes and non-null counts. Columns earmarked for removal in the next cell:
# candidate_gold, relevant_yn_gold, sentiment_gold, subject_matter_gold,
# tweet_coord, tweet_location, user_timezone
df.info()

In [None]:
# Columns removed before modelling: the *_gold fields, tweet geography/timezone,
# the identifiers, the creation time and the author name.
columns_to_drop = [
    'candidate_gold', 'relevant_yn_gold', 'sentiment_gold', 'subject_matter_gold',
    'tweet_coord', 'tweet_location', 'user_timezone',
    'id', 'tweet_created', 'tweet_id', 'name',
]
df = df.drop(columns=columns_to_drop)
df.head(3)

In [None]:
# Count the missing values in each remaining column.
df.isnull().sum()

In [None]:
# Drop every row that contains a missing value, then confirm no nulls remain.
df.dropna(inplace=True)
df.isnull().sum()

In [None]:
df['candidate'].unique() # distinct candidates; can be encoded with a label encoder if needed.

In [None]:
# Summary statistics of the candidate confidence score.
df['candidate_confidence'].describe() 

In [None]:
df.head(3)

In [None]:
df['relevant_yn'].nunique() # number of distinct values; can be encoded with a label encoder if needed.

In [None]:
# Summary statistics of the relevance confidence score.
df['relevant_yn_confidence'].describe() 

In [None]:
df.head()

In [None]:
# Distinct sentiment labels present in the data.
df['sentiment'].unique()

In [None]:
# Summary statistics of the sentiment confidence score.
df['sentiment_confidence'].describe()

In [None]:
# Number of tweets per subject-matter category.
df['subject_matter'].value_counts()

In [None]:
# Summary statistics of the retweet counts.
df['retweet_count'].describe()

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# string columns (e.g. 'text', 'sentiment'); pandas >= 2.0 raises on those
# unless numeric_only=True is passed explicitly.
df.corr(numeric_only=True)

In [None]:
# Inspect the raw tweet text column.
df['text']

In [None]:
# Look at the first three tweets individually to see what needs cleaning.
df['text'].iloc[0]

In [None]:
df['text'].iloc[1]

In [None]:
df['text'].iloc[2]

In [None]:
df.head(5)

In [None]:
import nltk
from nltk.corpus import stopwords

# Build the English stop-word set once so membership tests are O(1).
# If the NLTK stop-word corpus is not installed, stopwords.words() raises
# LookupError; download it and retry so the notebook runs on a fresh machine.
try:
    stopwords_set = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stopwords_set = set(stopwords.words("english"))

def remove_stopwords(doc):
    """Lower-case a tweet and strip Twitter noise and English stop words.

    The text is split on whitespace and each token is lower-cased. A token is
    discarded when it contains 'http', starts with '@' or '#', equals 'rt',
    or is a member of ``stopwords_set``. The remaining tokens are returned
    joined by single spaces.
    """
    kept = []
    for raw in doc.split():
        token = raw.lower()
        # Links, mentions, hashtags and the retweet marker.
        if 'http' in token or token.startswith('@') or token.startswith('#') or token == 'rt':
            continue
        if token in stopwords_set:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# First two tweets before cleaning, for comparison.
df['text'].iloc[0:2]

In [None]:
# Overwrite the text column with the output of remove_stopwords for every tweet.
df['text'] = df['text'].apply(remove_stopwords)

In [None]:
# The same two tweets after cleaning.
df['text'].iloc[0:2]

In [None]:
# Translation table deleting every character that remove_punctuations strips.
# The backslash is written as an explicit escape (the bare "\," form is an
# invalid escape sequence and warns on recent Python versions). U+FFFD, the
# Unicode replacement character, is included because it occurs many times in
# the tweet text.
_PUNCTUATION_TABLE = str.maketrans('', '', """!()-[]{};:'"\\,“”<>./?@#$%^&*_~""" + '\ufffd')


def remove_punctuations(doc):
    """Return ``doc`` with all punctuation characters removed.

    Parameters
    ----------
    doc : str
        Text to clean.

    Returns
    -------
    str
        ``doc`` with every character of the punctuation set deleted; all other
        characters (including whitespace) are left untouched.
    """
    # A single translate() pass replaces the original one-replace-per-character loop.
    return doc.translate(_PUNCTUATION_TABLE)

In [None]:
# Strip punctuation from every tweet.
df['text'] = df['text'].apply(remove_punctuations)

In [None]:
# Preview the first two tweets after punctuation removal.
df['text'].iloc[0:2]

In [None]:
def remove_digits(doc):
    """Keep only the purely alphabetic tokens of ``doc``.

    The text is split on whitespace; every token for which ``str.isalpha()``
    is false (numbers, mixed alphanumerics, leftover symbols) is discarded and
    the surviving tokens are re-joined with single spaces.
    """
    return ' '.join(filter(str.isalpha, doc.split()))

In [None]:
# Drop every non-alphabetic token from each tweet.
df['text'] = df['text'].apply(remove_digits)

In [None]:
# Preview the first two tweets after the non-alphabetic tokens were removed.
df['text'].iloc[0:2]

In [None]:
#importing libraries for stemming
import re
import nltk
from nltk.stem import SnowballStemmer #general stemmer
# Show the languages supported by the Snowball stemmer, separated by spaces.
print(*SnowballStemmer.languages)

In [None]:
# The tweets are in English, so use the English Snowball stemmer.
stemmer = SnowballStemmer("english")


def stem_document(doc):
    """Stem each whitespace-separated token of ``doc`` and re-join with spaces."""
    return ' '.join(stemmer.stem(token) for token in doc.split())


# SnowballStemmer.stem() operates on a single word. Passing a whole tweet would
# treat the tweet as one long word, so each token is stemmed individually.
df['text'] = df['text'].apply(stem_document)

In [None]:
# List the distinct sentiment labels.
df['sentiment'].unique()

**WordClouds of Positive and Negative Sentiments**

In [None]:
from wordcloud import WordCloud,STOPWORDS
from matplotlib import pyplot as plt

# Text of the positive and of the negative tweets, each as a Series.
df_pos = df.loc[df['sentiment'] == 'Positive', 'text']
df_neg = df.loc[df['sentiment'] == 'Negative', 'text']

def wordcloud_draw(data, color = 'black'):
    """Render a word cloud for a collection of text documents.

    data  : iterable of strings; joined with spaces into a single corpus.
    color : background colour handed to WordCloud (defaults to 'black').

    The cloud is generated with WordCloud's built-in STOPWORDS and shown on a
    13x13 inch matplotlib figure with the axes hidden.
    """
    corpus = ' '.join(data)
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color=color,
        width=2500,
        height=2000,
    )
    image = cloud.generate(corpus)
    plt.figure(1, figsize=(13, 13))
    plt.imshow(image)
    plt.axis('off')
    plt.show()
    
# Positive tweets on a white background, negative tweets on the default black one.
print("Positive words")
wordcloud_draw(df_pos,'white')
print("Negative words")
wordcloud_draw(df_neg)

In [None]:
df_final = df[df['sentiment'] != 'Neutral']

In [None]:
df_final['sentiment'].unique()

In [None]:
df_final['sentiment'] = df_final['sentiment'].apply(lambda x : 1 if x == 'Positive' else 0)

In [None]:
# Check dtypes and row counts of the binary-sentiment frame.
df_final.info()

In [None]:
# Create the predictor (cleaned tweet text) and target (encoded sentiment) variables.
X = df_final['text']
y = df_final['sentiment']

In [None]:
X

In [None]:
y

In [None]:
# Split the dataset into train and test sets: 20% held out, fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
#ml algorithm work with numbers only so we will convert our text feature in numeric form
#we will bag of words approach here
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(X) #creating corpus using the whole data

In [None]:
#transforming train data in numeric form with help of whole corpus
X_train = vectorizer.transform(X_train)
# print(vectorizer.get_feature_names())

In [None]:
X_train.shape

In [None]:
#transforming test data in numeric form with help of whole corpus
X_test = vectorizer.transform(X_test)

In [None]:
X_test.shape

# **Trying naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes classifier with default settings.
GNB_classifier = GaussianNB()

In [None]:
%%time
# Fit on the training split; .toarray() converts the vectorizer output to a dense array first.
GNB_classifier.fit(X_train.toarray(),y_train)

In [None]:
%%time
# Score the fitted model on the held-out test split.
GNB_classifier.score(X_test.toarray(), y_test)

# **Trying support vector classifier**

In [None]:
# Import the support vector classifier from scikit-learn and create one with default settings.
from sklearn.svm import SVC
svm_clf_model = SVC()

In [None]:
%%time
# Fit on the training split (features converted to a dense array).
svm_clf_model.fit(X_train.toarray(),y_train)

In [None]:
%%time
# Score the fitted model on the held-out test split.
svm_clf_model.score(X_test.toarray(), y_test)

# **Trying another support vector classifier**

In [None]:
# Second support vector classifier, this time with a sigmoid kernel, C=100 and gamma=1.
from sklearn.svm import SVC
svm_clf_model1 = SVC(C=100,gamma=1,kernel='sigmoid')

In [None]:
%%time
# Fit on the training split (features converted to a dense array).
svm_clf_model1.fit(X_train.toarray(),y_train)

In [None]:
%%time
# Score the fitted model on the held-out test split.
svm_clf_model1.score(X_test.toarray(), y_test)

# **Trying passive aggressive classifier**

https://www.geeksforgeeks.org/passive-aggressive-classifiers/

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
# The classifier shuffles the training data between epochs, so a fixed
# random_state is needed for the score to be reproducible across runs.
pac=PassiveAggressiveClassifier(max_iter=50, random_state=101)

In [None]:
%%time
# Fit the passive aggressive classifier on the training split (dense features).
pac.fit(X_train.toarray(),y_train)

In [None]:
# Score the fitted model on the held-out test split.
pac.score(X_test.toarray(), y_test)

# **By Selecting various number of features only**

**Experimenting**
1. By changing the number of features
2. By using n-grams and varying the number of features
3. By topic modelling or so
--Try doing some topic modelling like latent Dirichlet allocation or Probabilistic latent Semantic Analysis for the corpus using a specified number of topics - say 20. You would get a vector of 20 probabilities corresponding to the 20 topics for each document. You could use that vector as input for your classification or use it as additional features on top of what you already have from your base model enhanced with bigrams and trigrams.

Source : https://datascience.stackexchange.com/questions/19276/improving-accuracy-of-text-classification

In [None]:
# Split the dataset into train and test sets again for the n-gram experiment.
# The arguments (20% test size, random_state=101) are the same as in the earlier split.
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2),max_features=6000) #max_features=2000,
vectorizer.fit(X)

In [None]:
#transforming train data in numeric form with help of whole corpus
X_train1 = vectorizer.transform(X_train1)
# print(vectorizer.get_feature_names())

In [None]:
X_train1.shape

In [None]:
#transforming train data in numeric form with help of whole corpus
X_test1 = vectorizer.transform(X_test1)
# print(vectorizer.get_feature_names())

In [None]:
X_test1.shape

In [None]:
from sklearn.svm import SVC
# NOTE(review): this rebinds svm_clf_model, replacing the classifier fitted in the earlier SVC section.
svm_clf_model = SVC()

In [None]:
%%time
# Fit on the n-gram training features (converted to a dense array).
svm_clf_model.fit(X_train1.toarray(),y_train1)

In [None]:
%%time
# Score the fitted model on the held-out n-gram test features.
svm_clf_model.score(X_test1.toarray(), y_test1)

In [None]:
# Keep the test-set predictions for the confusion matrix below.
y_pred1 = svm_clf_model.predict(X_test1.toarray())

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix with the label order fixed to [0, 1].
cf_mat = confusion_matrix(y_test1, y_pred1,labels=[0,1])
cf_mat

In [None]:
# Compare, per class, the number of true samples with the number predicted correctly
# (the diagonal entries of the confusion matrix).
print(f'True 0 are {(y_test1==0).sum()} True predicted 0 are {cf_mat[0][0]}')
print(f'True 1 are {(y_test1==1).sum()} True predicted 1 are {cf_mat[1][1]}')

In [None]:
# The model is too biased towards the negative class (class 0) because the dataset is imbalanced.

In [None]:
# Number of samples labelled 0 in y, computed two ways (boolean sum and filtered count).
print((y==0).sum())
print((y[y==0].count()))


**Model Evaluation is Pending**
--> Topics to be covered

1. Classification accuracy
2. Confusion matrix
3. Precision and recall
4. F1 score
5. Sensitivity and specificity
6. ROC curve and AUC

**MORE TO TRY**
https://datascience.stackexchange.com/questions/19276/improving-accuracy-of-text-classification

**Source for Learn Word Embeddings**
https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/text/word_embeddings.ipynb#scrollTo=Q6mJg1g3apaz

https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture

https://www.shanelynn.ie/word-embeddings-in-python-with-spacy-and-gensim/

https://kavita-ganesan.com/easily-access-pre-trained-word-embeddings-with-gensim/

In [None]:
# Another way to think of an embedding is as "lookup table". 
# pretrained word embedding provided by --> Gensim,Spacy.

# The disadvantage of pre-trained word embeddings is that the words contained within may not capture 
# the peculiarities of language in your specific application domain

# The vectors can be accessed directly using the .vector attribute of each processed token (word).
# The mean vector for the entire sentence is also calculated simply using .vector, 
# providing a very convenient input for machine learning models based on sentences.

# phrase detection in gensim using
# from gensim.models.phrases import Phraser, Phrases

# gensim appears to be a popular NLP package, and has some nice documentation and tutorials, including for word2vec.
# Source : https://www.shanelynn.ie/word-embeddings-in-python-with-spacy-and-gensim/

# **Sentiment Analysis using Deep Learning**

https://towardsdatascience.com/all-you-need-to-know-about-rnns-e514f0b00c7c

https://medium.com/deep-learning-with-keras/lstm-understanding-the-number-of-parameters-c4e087575756#:~:text=LSTM%20layer%20has%20%E2%80%9Cdimensionality%20of,vector%20with%20dimension%203%20(feature)

https://colah.github.io/posts/2015-08-Understanding-LSTMs/

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical

In [None]:
# Vocabulary size limit passed to the Keras tokenizer (and reused as the Embedding input size below).
max_fatures = 3000
tokenizer = Tokenizer(num_words=max_fatures)

In [None]:
# Build the word index from the cleaned tweets.
# NOTE(review): the tokenizer is fitted on all of df_final, i.e. before the train/test split below.
tokenizer.fit_on_texts(df_final['text'].values)

In [None]:
# Convert every tweet into a sequence of integer word ids.
# NOTE(review): X is rebound here; it previously held the tweet text Series, so earlier
# cells that pass X to a text vectorizer will not work if re-run after this point.
X = tokenizer.texts_to_sequences(df_final['text'].values)
type(X)

In [None]:
X[0],X[1],X[2],X[3],X[4],X[5],X[6]

In [None]:
# Pad the sequences so they form a 2-D array (X.shape[1] is used as the model input length).
X = pad_sequences(X)

In [None]:
X[0],X[1],X[2],X[3],X[4],X[5],X[6]

In [None]:
from sklearn.model_selection import train_test_split
# Split the padded sequences and the encoded sentiment target; test_size is left at its default.
X_train, X_test, Y_train, Y_test = train_test_split(X,y, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
# Embedding vector size and number of LSTM units.
embed_dim = 128
lstm_out = 32

# Embedding -> LSTM -> single sigmoid unit for the binary sentiment target.
model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(LSTM(lstm_out))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
model.summary()

In [None]:
# Train the model built above (LSTM output size = 32).
batch_size = 64
epochs = 10
model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size)

In [None]:
# Compare wider LSTM layers (output size = 64 and 128). A fresh model is built
# for every width: calling fit() again on the existing `model` would only keep
# training the 32-unit network instead of changing its output size.
batch_size = 64
epochs = 10
for lstm_out in (64, 128):
    print(f'LSTM output size = {lstm_out}')
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size)
    # Report held-out performance so the different widths can be compared.
    test_loss, test_acc = model.evaluate(X_test, Y_test, verbose=0)
    print(f'test loss = {test_loss:.4f}, test accuracy = {test_acc:.4f}')

Source for below
https://www.kaggle.com/jaydeepbhalala/gensim-word2vec-tutorial/edit

https://stackoverflow.com/questions/42064690/using-pre-trained-word2vec-with-lstm-for-word-generation

https://www.kaggle.com/guichristmann/lstm-classification-model-with-word2vec