In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('../../12_data/moviereviews.tsv', sep='\t')

In [5]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [6]:
len(df)

2000

In [7]:
print(df['review'][2])

this has been an extraordinary year for australian films . 
 " shine " has just scooped the pool at the australian film institute awards , picking up best film , best actor , best director etc . to that we can add the gritty " life " ( the anguish , courage and friendship of a group of male prisoners in the hiv-positive section of a jail ) and " love and other catastrophes " ( a low budget gem about straight and gay love on and near a university campus ) . 
i can't recall a year in which such a rich and varied celluloid library was unleashed from australia . 
 " shine " was one bookend . 
stand by for the other one : " dead heart " . 
>from the opening credits the theme of division is established . 
the cast credits have clear and distinct lines separating their first and last names . 
bryan | brown . 
in a desert settlement , hundreds of kilometres from the nearest town , there is an uneasy calm between the local aboriginals and the handful of white settlers who live nearby . 
the loc

In [8]:
df.isnull().sum()

label      0
review    35
dtype: int64

In [9]:
#df.dropna(inplace=True) removes all rows from the pandas DataFrame df that contain missing values (NA or NaN).
df.dropna(inplace=True)

In [10]:
df.isnull().sum()

label     0
review    0
dtype: int64

In [11]:
mystring ='hello'
empty = ' '

In [12]:
#isspace() method checks if a string consists only of whitespace characters (spaces, tabs, newlines).
empty.isspace()

True

In [13]:
blanks = [i for i, lb, rv in df.itertuples(index=True) if rv.isspace()]


In [14]:
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [15]:
df.drop(blanks, inplace=True)

In [16]:
len(df)

1938

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X = df['review']

In [19]:
y = df['label']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
#Pipeline: Allows chaining multiple steps (like preprocessing, feature extraction, and model training) into a single sequential pipeline.
#TfidfVectorizer: Converts raw text into TF-IDF vectors (Term Frequency-Inverse Document Frequency), which numerically represent the importance of words in the documents.
#LinearSVC: A linear Support Vector Classifier. It is a powerful linear model often used for text classification tasks.

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [23]:
#step 1 :abeled 'tfidf', applies TfidfVectorizer() to convert raw text data into TF-IDF feature vectors.
#step 2 : abeled 'clf', applies a LinearSVC() classifier for training and prediction.
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [24]:
text_clf.fit(X_train, y_train)

In [25]:
predictions = text_clf.predict(X_test)

In [26]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [27]:
print(confusion_matrix, y_test, predictions)

<function confusion_matrix at 0x11d5c0a40> 600     pos
931     pos
937     pos
1811    neg
1512    neg
       ... 
1675    neg
798     neg
1108    neg
1705    neg
771     pos
Name: label, Length: 640, dtype: object ['neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg'
 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg'
 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos'
 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'pos'
 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos' 'pos'
 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos'
 'neg' 'pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'neg'
 'neg' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos'
 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'neg' 'pos'
 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg'
 'neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 

In [29]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Assume predictions are already made
# predictions = text_clf.predict(X_test)

# 1. Print the confusion matrix
cm = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(cm)

# 2. Print the classification report (precision, recall, F1-score, etc.)
cr = classification_report(y_test, predictions)
print("\nClassification Report:")
print(cr)

# 3. Print the accuracy score
acc = accuracy_score(y_test, predictions)
print(f"\nAccuracy: {acc:.4f}")


Confusion Matrix:
[[259  49]
 [ 49 283]]

Classification Report:
              precision    recall  f1-score   support

         neg       0.84      0.84      0.84       308
         pos       0.85      0.85      0.85       332

    accuracy                           0.85       640
   macro avg       0.85      0.85      0.85       640
weighted avg       0.85      0.85      0.85       640


Accuracy: 0.8469
