## IMDB DATASET

In [1]:
import pandas as pd

In [2]:
df_imdb = pd.read_csv("IMDB_Dataset.csv")

In [3]:
df_imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df_imdb.loc[df_imdb['sentiment'] == 'positive', 'sentiment'] = 1
df_imdb.loc[df_imdb['sentiment'] == 'negative', 'sentiment'] = 0

In [5]:
df_imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Amazon Instrument Reviews

In [6]:
df_amazon_music = pd.read_csv("Musical_instruments_reviews.csv")

In [7]:
df_amazon_music.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [8]:
df_amazon_music = df_amazon_music.drop(['reviewerID', 'asin', 'reviewerName', 'helpful', 'summary', 'unixReviewTime', 'reviewTime'], axis = 1)

In [9]:
df_amazon_music.head()

Unnamed: 0,reviewText,overall
0,"Not much to write about here, but it does exac...",5.0
1,The product does exactly as it should and is q...,5.0
2,The primary job of this device is to block the...,5.0
3,Nice windscreen protects my MXL mic and preven...,5.0
4,This pop filter is great. It looks and perform...,5.0


In [10]:
df_amazon_music['overall'] = df_amazon_music['overall'].apply(lambda x: 1 if x > 3.0 else 0)

In [11]:
df_amazon_music.head()

Unnamed: 0,reviewText,overall
0,"Not much to write about here, but it does exac...",1
1,The product does exactly as it should and is q...,1
2,The primary job of this device is to block the...,1
3,Nice windscreen protects my MXL mic and preven...,1
4,This pop filter is great. It looks and perform...,1


In [12]:
df_amazon_music.rename(columns={'reviewText' : 'review', 'overall' : 'sentiment'}, inplace = True)

In [13]:
df_amazon_music.head()

Unnamed: 0,review,sentiment
0,"Not much to write about here, but it does exac...",1
1,The product does exactly as it should and is q...,1
2,The primary job of this device is to block the...,1
3,Nice windscreen protects my MXL mic and preven...,1
4,This pop filter is great. It looks and perform...,1


## Concatenating Datasets

In [14]:
df_imdb_negative = df_imdb[df_imdb['sentiment'] == 0].head(2000)
df_imdb_positive = df_imdb[df_imdb['sentiment'] == 1].head(2000)

In [15]:
df_amazon_music_negative = df_amazon_music[df_amazon_music['sentiment'] == 0].head(2000)
df_amazon_music_positive = df_amazon_music[df_amazon_music['sentiment'] == 1].head(2000)

In [16]:
df_final = pd.concat([df_imdb_negative, df_imdb_positive, df_amazon_music_negative, df_amazon_music_positive], ignore_index=True, sort=False)

In [17]:
df_final.head()

Unnamed: 0,review,sentiment
0,Basically there's a family where a little boy ...,0
1,"This show was an amazing, fresh & innovative i...",0
2,Encouraged by the positive comments about this...,0
3,Phil the Alien is one of those quirky films wh...,0
4,I saw this movie when I was about 12 when it c...,0


## Sentiment Analysis

In [18]:
import nltk

In [19]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
import string
from tqdm import tqdm

In [20]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\puran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\puran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\puran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
def preprocessing(text):

    token = word_tokenize(text)
    token = [word for word in token if word.isalpha()]
    words_no_punctuation = [word.lower() for word in token if word not in string.punctuation]

    stop_words = set(stopwords.words('english'))
    words_no_stop_words = [word for word in words_no_punctuation if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    words_lemmatized = [lemmatizer.lemmatize(word) for word in words_no_stop_words]
    
    return words_lemmatized

In [22]:
reviews = df_final['review']
processed_reviews = reviews.apply(lambda i: preprocessing(i))
df_final['processed'] = processed_reviews
df_final.head()

Unnamed: 0,review,sentiment,processed
0,Basically there's a family where a little boy ...,0,"[basically, family, little, boy, jake, think, ..."
1,"This show was an amazing, fresh & innovative i...",0,"[show, amazing, fresh, innovative, idea, first..."
2,Encouraged by the positive comments about this...,0,"[encouraged, positive, comment, film, looking,..."
3,Phil the Alien is one of those quirky films wh...,0,"[phil, alien, one, quirky, film, humour, based..."
4,I saw this movie when I was about 12 when it c...,0,"[saw, movie, came, recall, scariest, scene, bi..."


In [23]:
df_final['processed'] = df_final['processed'].apply(lambda tokens : ' '.join(tokens))
df_final['processed']

0       basically family little boy jake think zombie ...
1       show amazing fresh innovative idea first aired...
2       encouraged positive comment film looking forwa...
3       phil alien one quirky film humour based around...
4       saw movie came recall scariest scene big bird ...
                              ...                        
7234    microphone amazon may best mic home studio one...
7235    love product came fast included extra shockmou...
7236    mxl sound every bit good expensive mike used l...
7237    took figure implement mic right equipment turn...
7238    two perfect home studio vocal step dynamic mic...
Name: processed, Length: 7239, dtype: object

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectroizer = TfidfVectorizer()

In [28]:
x = vectroizer.fit_transform(df_final['processed']).toarray()
y = df_final['sentiment'].astype(int)

In [29]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [30]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [32]:
from sklearn.metrics import accuracy_score
y_pred = lr.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8259668508287292
