<a href="https://colab.research.google.com/github/riyaaaarane/expense/blob/main/sentimentalAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
df = pd.read_csv('/content/train.csv', encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1


In [4]:
df.shape

(17494, 5)

In [5]:
df.isnull().sum()

Unnamed: 0,0
review_id,0
title,0
year,178
user_review,0
user_suggestion,0


In [6]:
df['year'].fillna(df['year'].mean())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17494 entries, 0 to 17493
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   review_id        17494 non-null  int64  
 1   title            17494 non-null  object 
 2   year             17316 non-null  float64
 3   user_review      17494 non-null  object 
 4   user_suggestion  17494 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 683.5+ KB


In [7]:
df.drop('year',axis=1,inplace=True)


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17494 entries, 0 to 17493
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review_id        17494 non-null  int64 
 1   title            17494 non-null  object
 2   user_review      17494 non-null  object
 3   user_suggestion  17494 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 546.8+ KB


In [9]:
df['user_suggestion'].value_counts()

Unnamed: 0_level_0,count
user_suggestion,Unnamed: 1_level_1
1,9968
0,7526


In [10]:
port_stem = PorterStemmer()

In [29]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [30]:
df['stemmed_content'] = df['user_review'].apply(stemming)

In [31]:
df.head()

Unnamed: 0,review_id,title,user_review,user_suggestion,stemmed_content
0,1,Spooky's Jump Scare Mansion,I'm scared and hearing creepy voices. So I'll...,1,scare hear creepi voic paus moment write revie...
1,2,Spooky's Jump Scare Mansion,"Best game, more better than Sam Pepper's YouTu...",1,best game better sam pepper youtub account nee...
2,3,Spooky's Jump Scare Mansion,"A littly iffy on the controls, but once you kn...",1,littli iffi control know play easi master made...
3,4,Spooky's Jump Scare Mansion,"Great game, fun and colorful and all that.A si...",1,great game fun color side note though get wind...
4,5,Spooky's Jump Scare Mansion,Not many games have the cute tag right next to...,1,mani game cute tag right next horror tag steam...


In [32]:
print(df['stemmed_content'])

0        scare hear creepi voic paus moment write revie...
1        best game better sam pepper youtub account nee...
2        littli iffi control know play easi master made...
3        great game fun color side note though get wind...
4        mani game cute tag right next horror tag steam...
                               ...                        
17489    arguabl singl greatest mmorp exist today free ...
17490    older game sure charm hold special place heart...
17491    frist start play everquest amaz still great pl...
17492    cool game thing realli piss ridabl transport t...
17493    game sinc littl kid alway sinc still rememb fi...
Name: stemmed_content, Length: 17494, dtype: object


In [33]:
print(df['user_suggestion'])

0        1
1        1
2        1
3        1
4        1
        ..
17489    1
17490    1
17491    1
17492    1
17493    1
Name: user_suggestion, Length: 17494, dtype: int64


In [34]:
X = df['stemmed_content'].values
Y = df['user_suggestion'].values

In [35]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [36]:
print(X.shape,X_train.shape,X_test.shape)

(17494,) (13995,) (3499,)


In [41]:
vectorizer = TfidfVectorizer()
# Fit the vectorizer to the training data
X_train = vectorizer.fit_transform(X_train)
# Transform the test data using the fitted vectorizer
X_test = vectorizer.transform(X_test)

In [42]:
model = LogisticRegression(max_iter=1000)

In [43]:
model.fit(X_train,Y_train)

In [44]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [45]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9021078956770275


In [46]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [47]:
print('Accuracy score of the testing data : ', test_data_accuracy)

Accuracy score of the testing data :  0.8428122320663046


In [48]:
import pickle

In [50]:
filename = 'trained_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [None]:
loaded_model = pickle.load(open('/content/trained_model.sav', 'rb'))