In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read both competition files with pandas and convert each table straight to
# a NumPy array, so that rows and columns can be addressed by position.
train = np.array(pd.read_csv('/kaggle/input/sentiment-analysis-of-tweets/train.txt'))
test = np.array(pd.read_csv('/kaggle/input/sentiment-analysis-of-tweets/test_samples.txt'))

**NAIVE BAYES CLASSIFIER (ONLY THIS APPROACH WAS SUBMITTED FOR THE KAGGLE COMPETITION)**



In [None]:
# Function to separate the columns of training and test set
def separate_columns(t):
    """Split a 2-D train or test array into one Python list per column.

    Parameters
    ----------
    t : numpy.ndarray
        Two-dimensional array. A three-column array is read as
        (id, sentiment, text); a two-column array is read as (id, text).

    Returns
    -------
    tuple of list
        ``(ID, sen, l)`` for a three-column array, ``(ID, l)`` for a
        two-column array.

    Raises
    ------
    ValueError
        If ``t`` has any other number of columns. Previously the function
        fell through and returned ``None``, which only surfaced later as an
        opaque unpacking error at the call site.
    """
    _, c = t.shape

    # Training data: id, sentiment label, text.
    if c == 3:
        return list(t[:, 0]), list(t[:, 1]), list(t[:, 2])

    # Test data: id, text.
    if c == 2:
        return list(t[:, 0]), list(t[:, 1])

    raise ValueError(
        "separate_columns expects an array with 2 or 3 columns, got %d" % c
    )

In [None]:
# Function for preprocessing the data
def preprocess(l):
    """Tokenise, normalise and filter a sequence of raw sentences.

    Every sentence is tokenised with NLTK's ``word_tokenize``. Each token
    then has the punctuation characters listed below removed, is
    lower-cased and is passed through ``WordNetLemmatizer.lemmatize``.
    A token is kept only if the result is longer than two characters,
    appears in the lower-cased NLTK ``words`` corpus and is not an English
    stop word.

    Parameters
    ----------
    l : sequence of str
        Raw sentences.

    Returns
    -------
    list of list of str
        One list of kept tokens per input sentence, in input order.

    Notes
    -----
    NOTE(review): this relies on the NLTK ``stopwords``, ``words`` and
    WordNet resources and the tokenizer models being available already;
    nothing here downloads them.
    """
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem.wordnet import WordNetLemmatizer

    # The backslash is escaped explicitly: the previous literal contained
    # "\," which is an invalid escape sequence (a warning on recent Python).
    # The set of characters removed is unchanged.
    punctuations = set('''!()-[]{};:'"\\,<>./?@#$%^&*_~''')
    lem = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))

    # A set gives constant-time membership tests. The corpus used to be kept
    # as a list, so every token triggered a linear scan over the whole
    # dictionary.
    known_words = {entry.lower() for entry in nltk.corpus.words.words()}

    text = []
    for sentence in l:
        kept = []
        for token in word_tokenize(sentence):
            # Strip punctuation, lower-case, then lemmatise.
            stripped = ''.join(ch for ch in token if ch not in punctuations)
            lemma = lem.lemmatize(stripped.lower())
            # Keep only dictionary words longer than two characters that
            # are not stop words.
            if len(lemma) > 2 and lemma in known_words and lemma not in stop_words:
                kept.append(lemma)
        text.append(kept)

    return text

In [None]:
# Function for counting sentiment frequencies in train data and store it in a dictionary
def count_sentiment(sen):
    """Count how many labels in ``sen`` equal each of the three sentiments.

    Parameters
    ----------
    sen : sequence
        Sentiment labels. Values other than 'positive', 'negative' and
        'neutral' are ignored.

    Returns
    -------
    dict
        Counts keyed by 'positive', 'negative' and 'neutral', in that order.
    """
    tally = {'positive': 0, 'negative': 0, 'neutral': 0}
    for label in sen:
        for name in tally:
            if label == name:
                tally[name] += 1
                break
    return tally

In [None]:
# Function for building vocabulary of train data
def vocabulary(txt):
    """Return the distinct words of ``txt`` in order of first appearance.

    Parameters
    ----------
    txt : iterable of iterable of str
        Tokenised documents.

    Returns
    -------
    list of str
        Each word exactly once, ordered by first occurrence.
    """
    # dict.fromkeys de-duplicates while keeping insertion order. The earlier
    # "if w not in vocab" test scanned the growing list for every token,
    # which made building the vocabulary quadratic in its size.
    return list(dict.fromkeys(word for doc in txt for word in doc))

In [None]:
# Function for getting the frequency of every word in the vocabulary for each class
# Then storing the frequencies in 3 dictionaries, one for each of the 3 classes
def freq_of_words_per_sentiment(sen, txt, vocab):
    """Count, per sentiment class, how often each vocabulary word occurs.

    Parameters
    ----------
    sen : sequence of str
        Sentiment label of each document ('positive', 'negative' or
        'neutral'); documents with any other label are ignored.
    txt : sequence of list of str
        Tokenised documents, aligned with ``sen``.
    vocab : iterable of str
        Words to report counts for.

    Returns
    -------
    tuple of dict
        ``(dict_pos, dict_neg, dict_neu)``, each mapping every word of
        ``vocab`` to its number of occurrences in documents of that class.
    """
    from collections import Counter

    per_class = {
        'positive': Counter(),
        'negative': Counter(),
        'neutral': Counter(),
    }

    # Count whole tokens. The earlier implementation joined the tokens into
    # one string and used str.count, which counts substrings: "art" was also
    # counted inside "party" and "start", inflating the frequencies.
    for label, tokens in zip(sen, txt):
        counter = per_class.get(label)
        if counter is not None:
            counter.update(tokens)

    dict_pos = {w: per_class['positive'][w] for w in vocab}
    dict_neg = {w: per_class['negative'][w] for w in vocab}
    dict_neu = {w: per_class['neutral'][w] for w in vocab}
    return dict_pos, dict_neg, dict_neu

In [None]:
# Function to count total number of words in each sentiment class
def no_of_words_per_sentiment(dict_pos, dict_neg, dict_neu):
    """Sum the word counts stored in each per-class frequency dictionary.

    Parameters
    ----------
    dict_pos, dict_neg, dict_neu : dict
        Word -> count mappings for the positive, negative and neutral class.

    Returns
    -------
    tuple
        ``(cpos, cneg, cneu)``: the sum of the values of each dictionary.
    """
    return tuple(sum(table.values()) for table in (dict_pos, dict_neg, dict_neu))

In [None]:
# Function to calculate the probability of each word in every sentiment class
def prob_words(vocab, dict_pos, dict_neg, dict_neu):
    """Turn per-class word counts into smoothed per-class word probabilities.

    For every word in a class dictionary the value is
    ``(count + 2) / (class_total + 2 * len(vocab))``, where ``class_total``
    comes from ``no_of_words_per_sentiment``.

    Parameters
    ----------
    vocab : sized collection
        The vocabulary; only its length is used.
    dict_pos, dict_neg, dict_neu : dict
        Word -> count mappings for the three classes.

    Returns
    -------
    tuple of dict
        ``(prob_dict_pos, prob_dict_neg, prob_dict_neu)``.
    """
    cpos, cneg, cneu = no_of_words_per_sentiment(dict_pos, dict_neg, dict_neu)
    smoothing_mass = 2 * len(vocab)

    def smooth(counts, class_total):
        # Same additive smoothing for each class.
        return {
            word: (count + 2) / (class_total + smoothing_mass)
            for word, count in counts.items()
        }

    return smooth(dict_pos, cpos), smooth(dict_neg, cneg), smooth(dict_neu, cneu)

In [None]:
# Function to calculate the probability of occurence of sentiments
def prob_sentiments(sen):
    """Return the log prior probability of each sentiment class.

    Parameters
    ----------
    sen : sequence of str
        Training labels.

    Returns
    -------
    tuple of float
        ``(prob_pos, prob_neg, prob_neu)``: the natural log of each class
        count divided by ``len(sen)``. A class that never occurs gets
        ``-inf``.

    Raises
    ------
    ValueError
        If ``sen`` is empty, since no prior can be estimated.
    """
    import math

    sentiment = count_sentiment(sen)
    r = len(sen)
    if r == 0:
        raise ValueError("prob_sentiments needs at least one label")

    def log_prior(count):
        # math.log(0) raises a domain error. A class with no examples has
        # prior probability 0, whose log is -inf, so it can never win the
        # arg-max taken over the class scores.
        return math.log(count / r) if count else float('-inf')

    prob_pos = log_prior(sentiment['positive'])
    prob_neg = log_prior(sentiment['negative'])
    prob_neu = log_prior(sentiment['neutral'])

    return prob_pos, prob_neg, prob_neu
 

In [None]:
# Function to train on the training data and predict the sentiment of each sentence in the test data
def predict(train, test):
    """Train the Naive Bayes model on ``train`` and label every row of ``test``.

    Parameters
    ----------
    train : numpy.ndarray
        Array that ``separate_columns`` splits into (id, sentiment, text).
    test : numpy.ndarray
        Array that ``separate_columns`` splits into (id, text).

    Returns
    -------
    tuple
        ``(ID, predictions)``: the test ids and one predicted label
        ('positive', 'negative' or 'neutral') per test sentence.
    """
    import math

    _, sen, l_train = separate_columns(train)
    ID, l = separate_columns(test)

    txt = preprocess(l)
    txt_train = preprocess(l_train)

    vocab = vocabulary(txt_train)
    dict_pos, dict_neg, dict_neu = freq_of_words_per_sentiment(sen, txt_train, vocab)

    prob_pos, prob_neg, prob_neu = prob_sentiments(sen)
    prob_dict_pos, prob_dict_neg, prob_dict_neu = prob_words(vocab, dict_pos, dict_neg, dict_neu)

    # Set lookup instead of scanning the vocabulary list for every token.
    known_words = set(vocab)

    predictions = []
    for tokens in txt:
        pos_score = 0
        neg_score = 0
        neu_score = 0
        # Words never seen in training are skipped.
        for w in tokens:
            if w in known_words:
                pos_score += math.log(prob_dict_pos[w])
                neg_score += math.log(prob_dict_neg[w])
                neu_score += math.log(prob_dict_neu[w])

        # Class score = sum of word log-likelihoods + log prior.
        pos = pos_score + prob_pos
        neg = neg_score + prob_neg
        neu = neu_score + prob_neu

        # Ties resolve in the order positive, negative, neutral.
        m = max(pos, neg, neu)
        if m == pos:
            predictions.append('positive')
        elif m == neg:
            predictions.append('negative')
        elif m == neu:
            predictions.append('neutral')
    return ID, predictions

In [None]:
# THE OUTPUT PRINTED HERE WAS SUBMITTED AS THE PREDICTIONS FOR THE KAGGLE COMPETITION
# The labels are bound to `predictions`, not `predict`: rebinding `predict`
# replaced the function with a list, so running this cell a second time
# failed with "'list' object is not callable".
ID, predictions = predict(train, test)
print('tweet_id,sentiment')
for tweet_id, label in zip(ID, predictions):
    print(tweet_id, ',', label)

   **NEURAL NETWORK APPROACHES - LSTM, CNN AND DEEP NEURAL NETWORK**
   
   **(I HAVE IMPLEMENTED THESE, BUT THIS CODE WAS NOT SUBMITTED FOR THE KAGGLE COMPETITION)**

In [None]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate

import pandas as pd
import numpy as np
import re
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
def preprocess_text(sen):
    """Clean one sentence with three successive regex substitutions.

    Parameters
    ----------
    sen : str
        Raw sentence.

    Returns
    -------
    str
        The sentence after every substitution has been applied in order.
    """
    substitutions = (
        # Replace every character that is not an ASCII letter with a space.
        ('[^a-zA-Z]', ' '),
        # Replace a single letter surrounded by whitespace with one space.
        (r"\s+[a-zA-Z]\s+", ' '),
        # Collapse each run of whitespace into one space.
        (r'\s+', ' '),
    )

    sentence = sen
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence

In [None]:
# Split the training array into ids, sentiment labels (y) and raw sentences,
# and the test array into ids and raw sentences, for the neural-network models.
I_D, y, sentences1 = separate_columns(train)
i_d, sentences2 = separate_columns(test)

In [None]:
# Run every training and test sentence through preprocess_text.
X = [preprocess_text(sen) for sen in sentences1]
X_test = [preprocess_text(sen) for sen in sentences2]

In [None]:
# Every sequence is padded to this length.
maxlen = 200

# Fit the tokenizer on the training sentences only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)

# One more than the number of distinct words the tokenizer recorded.
vocab_size = len(tokenizer.word_index) + 1

# Convert the sentences to integer sequences and pad them in one step.
X_train = pad_sequences(tokenizer.texts_to_sequences(X), padding='post', maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), padding='post', maxlen=maxlen)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

### integer mapping using LabelEncoder
# NOTE: this cell rebinds `y` from string labels to a one-hot matrix, so the
# cell that defines `y` must be re-run before running this one again.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y)

integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)

### One hot encoding
# The `sparse=` keyword was renamed `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so `OneHotEncoder(sparse=False)` raises TypeError on current
# releases. Densifying the default sparse result works on every version.
onehot_encoder = OneHotEncoder()
y = onehot_encoder.fit_transform(integer_encoded).toarray()

**LSTMs**

In [None]:
# LSTM classifier: embedding -> LSTM(128) -> 3-way softmax.
deep_inputs = Input(shape=(maxlen,))
# No pretrained weights are supplied, so the embedding must be trainable;
# with trainable=False the model was stuck with random, frozen word vectors.
embedding_layer = Embedding(vocab_size, 100, trainable=True)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
# The three sentiments are mutually exclusive one-hot targets, so the output
# is a softmax trained with categorical cross-entropy. Sigmoid outputs with
# binary cross-entropy treat each class as an independent yes/no problem and
# make the 'acc' metric report per-output binary accuracy.
dense_layer_1 = Dense(3, activation='softmax')(LSTM_Layer_1)
model_LSTM = Model(inputs=deep_inputs, outputs=dense_layer_1)

model_LSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line after the table.
model_LSTM.summary()

In [None]:
# Train the LSTM model on the padded training sequences and one-hot labels.
history_LSTM = model_LSTM.fit(
    X_train,
    y,
    batch_size=128,
    epochs=5,
    verbose=1,
)

In [None]:
# Print the label of the highest-scoring output for every test sentence.
# Index 0 -> negative, 1 -> neutral, anything else -> positive
# (presumably LabelEncoder's sorted class order; verify against the labels).
p1 = model_LSTM.predict(X_test)
for scores in p1:
    scores = list(scores)
    best = scores.index(max(scores))
    print({0: 'negative', 1: 'neutral'}.get(best, 'positive'))

**CNN**

In [None]:
# CNN classifier: embedding -> Conv1D(128, 5) -> global max pool -> 3-way softmax.
model_CNN = Sequential()

# No pretrained weights are supplied, so the embedding must be trainable;
# with trainable=False the model was stuck with random, frozen word vectors.
embedding_layer1 = Embedding(vocab_size, 100, input_length=maxlen, trainable=True)
model_CNN.add(embedding_layer1)

model_CNN.add(tf.keras.layers.Conv1D(128, 5, activation='relu'))
model_CNN.add(GlobalMaxPooling1D())
# The three sentiments are mutually exclusive one-hot targets, so the output
# is a softmax trained with categorical cross-entropy rather than independent
# sigmoids with binary cross-entropy.
model_CNN.add(Dense(3, activation='softmax'))
model_CNN.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line after the table.
model_CNN.summary()

In [None]:
# Train the CNN model on the padded training sequences and one-hot labels.
history_CNN = model_CNN.fit(
    X_train,
    y,
    batch_size=128,
    epochs=50,
    verbose=1,
)

In [None]:
# Print the label of the highest-scoring output for every test sentence.
# Index 0 -> negative, 1 -> neutral, anything else -> positive
# (presumably LabelEncoder's sorted class order; verify against the labels).
p2 = model_CNN.predict(X_test)
for scores in p2:
    scores = list(scores)
    best = scores.index(max(scores))
    print({0: 'negative', 1: 'neutral'}.get(best, 'positive'))

**Deep Neural Networks**

In [None]:
# Dense classifier: embedding -> flatten -> 3-way softmax.
model_deep = Sequential()
# No pretrained weights are supplied, so the embedding must be trainable;
# with trainable=False the model was stuck with random, frozen word vectors.
embedding_layer2 = Embedding(vocab_size, 100, input_length=maxlen, trainable=True)
model_deep.add(embedding_layer2)

model_deep.add(Flatten())
# The three sentiments are mutually exclusive one-hot targets, so the output
# is a softmax trained with categorical cross-entropy rather than independent
# sigmoids with binary cross-entropy.
model_deep.add(Dense(3, activation='softmax'))

model_deep.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line after the table.
model_deep.summary()

In [None]:
# Train the dense model on the padded training sequences and one-hot labels.
history_deep = model_deep.fit(
    X_train,
    y,
    batch_size=128,
    epochs=50,
    verbose=1,
)

In [None]:
# Print the label of the highest-scoring output for every test sentence.
# Index 0 -> negative, 1 -> neutral, anything else -> positive
# (presumably LabelEncoder's sorted class order; verify against the labels).
p3 = model_deep.predict(X_test)
for scores in p3:
    scores = list(scores)
    best = scores.index(max(scores))
    print({0: 'negative', 1: 'neutral'}.get(best, 'positive'))