In [1]:
# Show the current working directory of the notebook runtime.
!pwd

/content


In [2]:
# Attach Google Drive to the Colab runtime so files stored there can be read.
from google.colab import drive

drive.mount(
    '/content/gdrive',
    force_remount=True,  # always remount rather than reuse an existing mount
)

Mounted at /content/gdrive


In [3]:
import os
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [4]:
# Location of the IMDB reviews CSV on the mounted Google Drive.
DATA_PATH = '/content/gdrive/MyDrive/IMDB Dataset.csv'

df = pd.read_csv(DATA_PATH)

# Fail fast if the file lacks the columns the rest of the notebook reads
# ('review' is the model input, 'sentiment' is the label).
missing_columns = {'review', 'sentiment'} - set(df.columns)
assert not missing_columns, (
    f"IMDB CSV is missing expected columns: {sorted(missing_columns)}"
)

In [5]:
# Count the reviews per sentiment label to check for class imbalance.
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [6]:
# Build a smaller, balanced working set from the first 2500 reviews of each
# sentiment class (taken in file order, so the selection is deterministic).
rows_per_class = 2500

positive_mask = df['sentiment'] == 'positive'
negative_mask = df['sentiment'] == 'negative'

df_positive = df.loc[positive_mask].iloc[:rows_per_class]
df_negative = df.loc[negative_mask].iloc[:rows_per_class]

# Positives first, then negatives; the original row labels are kept.
df_mini = pd.concat([df_positive, df_negative])

In [7]:
# Hold out 30% of the sampled reviews for evaluation. The fixed random_state
# makes the shuffle, and therefore the split, reproducible.
train, test = train_test_split(
    df_mini,
    test_size=0.30,
    random_state=0,
)

In [8]:
# Separate the review text (model input) from the sentiment label (target)
# for both the training and the held-out split.
train_x, train_y = train['review'], train['sentiment']
test_x, test_y = test['review'], test['sentiment']

In [9]:
# Display the training labels to sanity-check the split.
train_y

691     negative
3096    positive
2883    positive
4401    positive
2768    positive
          ...   
4820    negative
1552    negative
3302    positive
185     negative
442     negative
Name: sentiment, Length: 3500, dtype: object

In [10]:
'''
TF-IDF (Term Frequency - Inverse Document Frequency)
Suppose some word is frequent in all the documents. Then (in a
balanced dataset) it appears about equally often everywhere, so it
does not help to tell any one document apart from the others and
is assigned a low weight.
Suppose another word is frequent in only a few of the documents
and absent from the rest. That word is significant to those
documents and is assigned a higher weight.
This is what the method does: for each word in each document, it
multiplies the word's frequency within that document (TF) by the
inverse document frequency (IDF), which shrinks as the number of
documents containing the word grows.
'''
# Fit the vectorizer on the training reviews only (English stop words are
# dropped) and encode those reviews as a sparse TF-IDF matrix.
tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(train_x)

# Show the shape and sparsity of the resulting matrix.
train_x_vector

<3500x32996 sparse matrix of type '<class 'numpy.float64'>'
	with 312488 stored elements in Compressed Sparse Row format>

In [11]:
# Encode the held-out reviews with the vectorizer already fitted on the
# training reviews; transform() is used here, so nothing is refitted.
test_x_vector = tfidf.transform(test_x)

In [12]:
# Train a support vector classifier with a linear kernel on the TF-IDF
# features of the training reviews.
svc = SVC(kernel='linear')

# Kept as the last expression so the fitted estimator is displayed.
svc.fit(train_x_vector, train_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
# Spot-check the classifier on a few hand-written reviews. Each review is
# vectorized with the fitted TF-IDF model and its predicted label is printed.
sample_reviews = [
    'A good movie',
    'An excellent movie',
    'I did not like this movie at all',
]

for review_text in sample_reviews:
    review_vector = tfidf.transform([review_text])
    print(svc.predict(review_vector))

['positive']
['positive']
['negative']


In [14]:
# Mean accuracy of the classifier on the held-out reviews.
test_accuracy = svc.score(test_x_vector, test_y)
test_accuracy

0.87

In [15]:
# Confusion matrix on the held-out reviews. The explicit label order fixes
# which class comes first along both axes.
label_order = ['positive', 'negative']
test_predictions = svc.predict(test_x_vector)

conf_mat = confusion_matrix(test_y, test_predictions, labels=label_order)

In [16]:
# Rows are the true labels and columns the predicted labels, both in the
# order passed via `labels` when the matrix was built (positive, negative).
print(conf_mat)

[[645  76]
 [119 660]]


In [17]:
# Per-class F1 on the held-out reviews. average=None returns one score per
# label, in the order given by `labels` (positive, then negative).
class_labels = ['positive', 'negative']
test_predictions = svc.predict(test_x_vector)

f1_score(test_y, test_predictions, labels=class_labels, average=None)

array([0.86868687, 0.87128713])