# Sentiment Analysis
#### By Rohit Suryavanshi
Work Timeline doc: https://docs.google.com/document/d/1IXqmf_iipkzNDwZqWczPdeCnnNbQvESU1ktOREng1D0/edit?usp=sharing

In [2]:
import numpy as np
import pandas as pd
import math
import nltk
nltk.download("popular")
#nltk.download()

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [9]:
#url = 'https://raw.githubusercontent.com/rohitsurya26/WiDS-/main/training.csv?token=GHSAT0AAAAAABQZ6QUJPBNBJ26UXSIVGRJ6YPM6TUQ'
df = pd.read_csv('training.csv')

FileNotFoundError: ignored

### Labels:

0.   sadness
1.   joy
2.   love
3.   anger
4.   fear
5.   surprise

In [6]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize

# Pre-Processing

## Remove Punctuations
#### Ref: https://studymachinelearning.com/text-preprocessing-removal-of-punctuations/

In [7]:
import string
regular_punct = list(string.punctuation)
def remove_punctuation(text):
    punct_list = regular_punct
    for punc in punct_list:
        if punc in text:
            text = text.replace(punc, ' ')
    return text.strip()
df['punc_text']=df['text'].apply(remove_punctuation)

## Tokenising Text

In [8]:
#nltk.download('punkt')
df['tokenised']=df.apply(lambda row: nltk.word_tokenize(row['punc_text']), axis=1)

## Lemmatization

In [9]:
from nltk.stem import PorterStemmer
porter_stemmer=PorterStemmer()
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

In [10]:
text=df['tokenised']
text=text.tolist()
for i in range(len(text)):
    text[i]=[lemmatizer.lemmatize(word=word,pos='v') for word in text[i]]
df['lemmatized']= pd.Series((i for i in text))

## Removal of Stopwords
##### Ref: https://stackabuse.com/removing-stop-words-from-strings-in-python/#usingpythonsnltklibrary¶
##### Ref: https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe

In [11]:
#nltk.download('stopwords')
stop = stopwords.words('english')
df['string_text']= df['lemmatized'].str.join(" ")
df['cleaned_text'] = df['string_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

### New Dataset with Pre-Processed Text

In [12]:
new_df = pd.DataFrame(df['cleaned_text'])
new_df['label']=df['label']
new_df.to_csv('pre-processed_data.csv',index=False)

## Implementation

In [13]:
## Splitting into training and test(C.V.) data set
n=len(new_df['label'])
idx_train = np.random.choice(n,int(n),replace=False) ## Gives us a list
#idx_test = np.random.choice(n,int(n*0.2),replace=False) ## int bcuz n*0.2 is float

In [14]:
df_train=new_df.loc[idx_train]
x_train=df_train['cleaned_text']
y_train=df_train['label']

In [None]:
# df_train,df_test=new_df.loc[idx_train],new_df.loc[idx_test]
# x_train,x_test=df_train['cleaned_text'],df_test['cleaned_text']
# y_train,y_test=df_train['label'],df_test['label']

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

corpus_train=x_train.tolist()
#corpus_test=x_test.tolist()

vectorizer =  CountVectorizer() 
#vectorizer =  TfidfVectorizer()

X_train = vectorizer.fit_transform(corpus_train)
#X_test = vectorizer.transform(corpus_test)
#type(X_train)
vocab=vectorizer.get_feature_names_out()
vocab_size=len(vocab)

# Deep Learning
### Used Tensorflow and Keras

In [16]:
import tensorflow 
from tensorflow import keras

In [17]:
## Max length(words) in a sentence
print(max(corpus_train))
max_length=24 ## counted by hand

zoom difficulties feel like give everything feel helpless alone desert cast ways voice action others another story zoom also temporarily loose view full picture


### Used One-hot Encoding and Padding to input it into Embedding Layer

In [None]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

encoded=[one_hot(i,vocab_size) for i in corpus_train]
padded=pad_sequences(encoded,maxlen=max_length,padding='post')

In [None]:
x=padded
y=y_train.values

### Developing Model

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Conv1D

embeded_vector_size=50
model = Sequential()
model.add(Embedding(vocab_size, embeded_vector_size, input_length=max_length,name="embedding"))
#model.add(Flatten())
model.add(Conv1D())
model.add(GlobalAveragePooling1D())
model.add(Dense(units=16,activation='relu'))
model.add(Dense(6, activation='softmax'))

NameError: ignored

In [None]:
model.compile(optimizer='adam',loss=keras.losses.SparseCategoricalCrossentropy(),metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 24, 50)            605350    
                                                                 
 global_average_pooling1d (G  (None, 50)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                816       
                                                                 
 dense_1 (Dense)             (None, 6)                 102       
                                                                 
Total params: 606,268
Trainable params: 606,268
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x, y, epochs=2, batch_size=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f66fd7f5410>

In [None]:
loss,accuracy=model.evaluate(x,y)
accuracy

In [None]:
url='https://raw.githubusercontent.com/rohitsurya26/WiDS-/main/test.csv?token=GHSAT0AAAAAABQZ6QUJJ4BQCEAY3FGG2FSAYPK2AOA'
df_test=pd.read_csv(url)
## REMOVE PUNCTUATION
df_test['text']=df_test['text'].apply(remove_punctuation)
## TOKENISATION
df_test.text=df_test.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
## LEMMATIZATION
submit=df_test['text']
submit=submit.tolist()
for i in range(len(submit)):
    submit[i]=[lemmatizer.lemmatize(word=word,pos='v') for word in submit[i]]
## REMOVE STOPWORDS
submit=pd.Series(submit)
df_test['string_text']= submit.str.join(" ")
df_test['cleaned_text'] = df_test['string_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
submit=df_test['cleaned_text']

In [None]:
corpus_submit=submit.tolist()
 
vectorizer =  CountVectorizer() 

submit_hmm = vectorizer.fit_transform(corpus_submit)
#X_test = vectorizer.transform(corpus_test)
#type(X_train)
vocab=vectorizer.get_feature_names_out(submit_hmm)
vocab_size=len(vocab)

In [None]:
#print(max(corpus_submit))
max_length=24

In [None]:
encoded_submit=[one_hot(i,vocab_size) for i in corpus_submit]
submit=pad_sequences(encoded_submit,maxlen=max_length,padding='post')
#submit=encoded_submit

In [None]:
p=model.predict(submit)
#print(p[:7])
# p[np.where(a==np.max(p))] = 1
b = np.zeros_like(p)
b[np.arange(len(p)), p.argmax(1)] = 1
kaggle=b
# kaggle=np.round(p)
# kaggle = p
#print(kaggle[0:7])
def labelling(matrix):
    n=len(matrix)
    lst=[1]*n
    for i in range(n):
        arr=matrix[i]
        arr=list(arr)
        idx=arr.index(max(arr))
        lst[i]=idx        
    return lst
a=labelling(kaggle)        


In [None]:
kaggle=pd.DataFrame(columns=['id','label'])
n=len(a)
k=np.arange(n)+1
kaggle['id']=pd.Series(k)
kaggle['label']=pd.Series(a)
len(kaggle['id'])
kaggle.to_csv('submission_tensor.csv',index=False)

# Naive Bayes Implementation

## Basic Way to Approach
1. Splitting dataframe in 80-20 way for training and test(C.V.) datasets
2. Import TF-IDF vectorizer and corpus from x_train and x_test
3. Apply vectorizer on corpus and obtain array containing TF-IDF values for each word
4. Apply Complement Naive Bayes

#### Implemented Scaling - Decreased Accuracy

In [None]:
from sklearn import preprocessing
# scaler1 = preprocessing.StandardScaler(with_mean=False).fit(X_train) ## in [-2,2]
# scaler2 = preprocessing.StandardScaler(with_mean=False).fit(X_test) ## with_mean=False is used for sparse matrices
# X_train_scaled = scaler1.transform(X_train)
# X_test_scaled = scaler2.transform(X_test)

# min_max_scaler = preprocessing.MinMaxScaler() # to fit array in [0,1] range

# min_max_scaler = preprocessing.MaxAbsScaler()
# # This estimator scales and translates each feature individually such that the maximal absolute value of each feature 
# # in the training set will be 1.0. 
# # It does not shift/center the data, and thus does not destroy any sparsity.
# # This scaler can also be applied to sparse CSR or CSC matrices.

# X_train_scaled = min_max_scaler.fit_transform(X_train)
# X_test_scaled = min_max_scaler.fit_transform(X_test)

# X_train_scaled.array
# X_test_scaled.toarray()
# y_train.array
# y_test.array

X_train_scaled=X_train
#X_test_scaled=X_test

In [None]:
from sklearn.naive_bayes import BernoulliNB ## something new score - 0.7
from sklearn.naive_bayes import CategoricalNB ##sparse error 
from sklearn.naive_bayes import GaussianNB ##sparse error
from sklearn.naive_bayes import ComplementNB ## dimension mismatch - max_score - 0.933125
from sklearn.naive_bayes import MultinomialNB ## dimension mismatch - 0.8

model = ComplementNB() ## 0.7240

### ENSEMBLE LEARNING
##### Decreased Accuracy since Complement NB was way stronger than multi and Bernoulli

In [None]:
### ENSEMBLE LEARNING

# model2 = MultinomialNB() ## 0.61
# model3 = BernoulliNB() ## 0.61400
# Cnb=model1.fit(X_train_scaled, y_train)
# Mnb=model2.fit(X_train_scaled, y_train)
# Bnb=model3.fit(X_train_scaled, y_train)

# from sklearn.ensemble import VotingClassifier
# #create a dictionary of our models
# estimators=[('Complement', Cnb), ('Multinomial', Mnb), ('Bernoulli', Bnb)]
# #create our voting classifier, inputting our models
# ensemble = VotingClassifier(estimators, voting='hard')
# submission=ensemble.fit(X_train_scaled, y_train)
# ensemble.score(X_test_scaled, y_test)

In [None]:
submission=model.fit(X_train_scaled, y_train) 

In [None]:
df_test=pd.read_csv('test.csv')
## REMOVE PUNCTUATION
df_test['text']=df_test['text'].apply(remove_punctuation)
## TOKENISATION
df_test.text=df_test.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
## LEMMATIZATION
submit=df_test['text']
submit=submit.tolist()
for i in range(len(submit)):
    submit[i]=[lemmatizer.lemmatize(word=word,pos='v') for word in submit[i]]
## REMOVE STOPWORDS
submit=pd.Series(submit)
df_test['string_text']= submit.str.join(" ")
df_test['cleaned_text'] = df_test['string_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
submit=df_test['cleaned_text']


In [None]:
n=len(submit)
corpus_submit=submit.tolist()
submit = vectorizer.transform(corpus_submit)
#submit_scaled = min_max_scaler.fit_transform(submit)
submit=submission.predict(submit)
kaggle=pd.DataFrame(columns=['id','label'])
k=np.arange(n)+1
kaggle['id']=pd.Series(k)
kaggle['label']=pd.Series(submit)
len(kaggle['id'])
kaggle.to_csv('submission_corrected.csv',index=False)

## WITHOUT SCALING - 0.73400, IMPROVES BY 0.1
## WITH SCALING, DOWN BY 0.01 FOR MAX ABS 
##               DOWN BY 0.12 FOR STANDARD SCALER  
## WITHOUT STOPWORDS - 0.654
## USING COUNT_VECTORIZER - 0.674
## with full dataset as train - 0.728 and score - 0.93
## BY REPETITION, 0.74200
## Spell checker and contractions brings down score
## CURRENT - FULL SET AND ADDED PUNCTUATIONS - increase by 0.004

In [None]:
import tensorflow as tf
from tensorflow import keras
#from tensorflow.keras import layers

In [None]:
x = tf.constant([[1., 2., 3.],
                 [4., 5., 6.]])
print(x)
print(x.shape)
print(x.dtype)
x @ tf.transpose(x)

In [None]:
tdf=pd.read_csv('play_data.csv')
y=tdf.label.values
x=np.column_stack((tdf.YearsExperience.values,tdf.Salary.values))
np.random.shuffle(x)  
#type(x)

In [None]:
#np.random.choice()
model=keras.Sequential([keras.layers.Dense(16,input_shape=(2,),activation='relu'),keras.layers.Dense(2,activation='relu')])
model.compile(optimizer='adam',loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics=['accuracy'])
model.fit(x,y,batch_size=16)

In [None]:
x=X_train.toarray()
y=y_train.values
print(x.shape)

In [None]:
model=keras.Sequential([keras.layers.Dense(16,input_shape=(12107,),activation='relu'),keras.layers.Dense(6,activation='relu')])
model.compile(optimizer='adam',loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics=['accuracy'])
y=new_df.label.values
model.fit(x[0:1000],y[0:1000],batch_size=1)