Import library


In [2]:
import pandas as pd
from datasets import load_dataset 

Load Huggingface dataset


In [3]:
dataset=load_dataset("imdb")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


Select Train shuffle data


In [5]:
train=dataset["train"].shuffle(seed=50).select(range(25000))
train_df=pd.DataFrame(train)
print(train_df[:5])

                                                text  label
0  The film lacks style, i mean original style. e...      0
1  A Murder investigation goes on back stage whil...      1
2  This is a film that was very well done. I had ...      1
3  I was staying in one night and got extremely b...      0
4  What's Good About It: Some inventive and genui...      1


select test data


In [None]:
test=dataset["test"].select(range(25000))
test_df=pd.DataFrame(test)
print(test_df[:5])

                                                text  label
0  I love sci-fi and am willing to put up with a ...      0
1  Worth the entertainment value of a rental, esp...      0
2  its a totally average film with a few semi-alr...      0
3  STAR RATING: ***** Saturday Night **** Friday ...      0
4  First off let me say, If you haven't enjoyed a...      0


Check label data if balance


In [7]:
print(train_df["label"].value_counts())
print(test_df["label"].value_counts())

0    12500
1    12500
Name: label, dtype: int64
0    12500
1    12500
Name: label, dtype: int64


NLTK to clean the texts


In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.data.path.append("nltk_data")

stop_words=set(stopwords.words("english"))
lemmatizer=WordNetLemmatizer()
def clean_text(text):
    text=text.lower()
    text=re.sub(r"<.*?>","",text)
    text=re.sub(r"[^a-z\s]", "",text)
    words=text.split()
    clean_words=[]
    for word in words:
        if word not in stop_words:
            lemma=lemmatizer.lemmatize(word)
            clean_words.append(lemma) 
    return " ".join(clean_words) 

Apply clean text store in new column


In [9]:
train_df["clean_text"]=train_df["text"].apply(clean_text)
test_df["clean_text"]=test_df["text"].apply(clean_text)

print(train_df[:5])
print(test_df[:5])

                                                text  label  \
0  The film lacks style, i mean original style. e...      0   
1  A Murder investigation goes on back stage whil...      1   
2  This is a film that was very well done. I had ...      1   
3  I was staying in one night and got extremely b...      0   
4  What's Good About It: Some inventive and genui...      1   

                                          clean_text  
0  film lack style mean original style everything...  
1  murder investigation go back stage vanity open...  
2  film well done heard mixed review production w...  
3  staying one night got extremely bored around f...  
4  whats good inventive genuinely creepy little e...  
                                                text  label  \
0  I love sci-fi and am willing to put up with a ...      0   
1  Worth the entertainment value of a rental, esp...      0   
2  its a totally average film with a few semi-alr...      0   
3  STAR RATING: ***** Saturday Night **

Split Train and Validation


In [10]:
from sklearn.model_selection import train_test_split
x_train,x_valid,y_train,y_valid=train_test_split(train_df["clean_text"],train_df["label"],test_size=0.20,random_state=50)

print(x_train.shape)
print(x_valid.shape)
print(y_train.shape)
print(y_valid.shape)

(20000,)
(5000,)
(20000,)
(5000,)


TFIDfvectorizer to features extraction


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(max_features=10000)

x_train_vec=vectorizer.fit_transform(x_train)
x_valid_vec=vectorizer.transform(x_valid)

x_test_vec=vectorizer.transform(test_df["clean_text"])
y_test_vec=test_df["label"]

print(x_train_vec.shape)
print(x_valid_vec.shape)
print(x_test_vec.shape)
print(y_test_vec.shape)

(20000, 10000)
(5000, 10000)
(25000, 10000)
(25000,)


Classification algorithmns (handle texts)


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

model1=LogisticRegression(max_iter=1000)
model2=MultinomialNB()

model1.fit(x_train_vec,y_train)
model2.fit(x_train_vec,y_train)

Models and vectorizer load in pickle file


In [None]:
Models={"Logistic Regression":model1,
         "MultinomialNB":model2 }
import joblib 
with open("nlptrained.pkl","wb") as f:
    joblib.dump(Models,f)
with open ("vectorizer.pkl","wb") as f:
    joblib.dump(vectorizer,f)     

Validation accuracy


In [21]:
y_valid_pred1=model1.predict(x_valid_vec)
y_valid_pred2=model2.predict(x_valid_vec)

from sklearn.metrics import accuracy_score

val_accuracy1=accuracy_score(y_valid,y_valid_pred1)
val_accuracy2=accuracy_score(y_valid,y_valid_pred2)

print("Logistic regression validation accuracy:",val_accuracy1)
print("Naive Bayes validation accuracy:",val_accuracy2)

Logistic regression validation accuracy: 0.883
Naive Bayes validation accuracy: 0.854


Test accuracy


In [16]:
y_test_pred1=model1.predict(x_test_vec)
y_test_pred2=model2.predict(x_test_vec)

from sklearn.metrics import accuracy_score

test_accuracy1=accuracy_score(y_test_vec,y_test_pred1)
test_accuracy2=accuracy_score(y_test_vec,y_test_pred2)

print("Logistic regression test accuracy:",test_accuracy1)
print("Naive Bayes test accuracy:",test_accuracy2)

Logistic regression test accuracy: 0.8772
Naive Bayes test accuracy: 0.83404


Confusion matrix (Test)


In [17]:
from sklearn.metrics import confusion_matrix
cm1=confusion_matrix(y_test_vec,y_test_pred1)
cm2=confusion_matrix(y_test_vec,y_test_pred2)
print("Confusion matrix logistic regression:\n",cm1)
print("confusion matrix naive bayes:\n",cm2)

Confusion matrix logistic regression:
 [[10914  1586]
 [ 1484 11016]]
confusion matrix naive bayes:
 [[10849  1651]
 [ 2498 10002]]


Classification report ( Precision FP, Recall FN, F1_score )


In [18]:
from sklearn.metrics import classification_report
cr1=classification_report(y_test_vec,y_test_pred1)
cr2=classification_report(y_test_vec,y_test_pred2)
print("Logistic Report:\n ",cr1,"\n")
print("Naive bayes Report:\n",cr2)


Logistic Report:
                precision    recall  f1-score   support

           0       0.88      0.87      0.88     12500
           1       0.87      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000
 

Naive bayes Report:
               precision    recall  f1-score   support

           0       0.81      0.87      0.84     12500
           1       0.86      0.80      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.84      0.83      0.83     25000
weighted avg       0.84      0.83      0.83     25000



Model Classification positive-1 and negative-0


In [19]:

new_text=[" This is a film that was very well done"]

vec=vectorizer.transform(new_text)
predict=model1.predict(vec)
if predict == 1:
    print("Positive")
else:
    print("Negative")     

Positive
