##Build a Sentiment Analysis Model for IMDB Movie Review Dataset


In [2]:
import nltk
nltk.download('stopwords')

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Concepts of technologies and AI/IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:

def preprocess_text(text):

    text = text.lower()

    text = re.sub(r'[^a-z\s]', '', text)

    words = text.split()

    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return ' '.join(words)

In [6]:

df['review'] = df['review'].apply(preprocess_text)

In [7]:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer()

X_train_vec = vectorizer.fit_transform(X_train)

model = MultinomialNB()
model.fit(X_train_vec, y_train)

X_test_vec = vectorizer.transform(X_test)


In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score

y_pred = model.predict(X_test_vec)

y_test_num = y_test.map({'negative': 0, 'positive': 1})
y_pred_num = pd.Series(y_pred).map({'negative': 0, 'positive': 1})

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

precision, recall, f1, _ = precision_recall_fscore_support(y_test_num, y_pred_num, average='binary')
print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")

conf_matrix = confusion_matrix(y_test_num, y_pred_num)
print(f"Confusion Matrix:\n{conf_matrix}")

roc_auc = roc_auc_score(y_test_num, model.predict_proba(X_test_vec)[:, 1])
print(f"ROC-AUC Score: {roc_auc}")

Accuracy: 0.8555
Precision: 0.8691454396055875, Recall: 0.8396507243500695, F1-score: 0.8541435348743313
Confusion Matrix:
[[4324  637]
 [ 808 4231]]
ROC-AUC Score: 0.9224501018641973


##4.1 Feature Selection using Wrapper Methods.


In [10]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Concepts of technologies and AI/Breast Cancer Prognostic.csv')

df.head()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [11]:

df.describe()



Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,0.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,


In [12]:
df.isnull().sum()

Unnamed: 0,0
id,0
diagnosis,0
radius_mean,0
texture_mean,0
perimeter_mean,0
area_mean,0
smoothness_mean,0
compactness_mean,0
concavity_mean,0
concave points_mean,0


In [13]:

df = df.dropna()




In [14]:
df.fillna(df.mean(), inplace=True)

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('/content/drive/MyDrive/Concepts of technologies and AI/Breast Cancer Prognostic.csv')

# Select only numeric columns for calculating the mean
numeric_df = df.select_dtypes(include=['number'])

# Impute missing values in numeric columns with the mean of those columns
df[numeric_df.columns] = numeric_df.fillna(numeric_df.mean())

X = df.drop(columns=['diagnosis'])
y = df['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

conf_matrix_all = confusion_matrix(y_test, y_pred_all)
conf_matrix_selected = confusion_matrix(y_test, y_pred_selected)

print(f"Confusion Matrix (All Features):\n{conf_matrix_all}")
print(f"Confusion Matrix (Selected Features):\n{conf_matrix_selected}")


In [None]:

for n_features in [3, 7]:
    rfe = RFE(estimator=log_reg, n_features_to_select=n_features)
    rfe.fit(X_train, y_train)

    selected_features = X_train.columns[rfe.support_]
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    log_reg.fit(X_train_selected, y_train)
    y_pred_selected = log_reg.predict(X_test_selected)

    accuracy_selected = accuracy_score(y_test, y_pred_selected)
    precision_selected, recall_selected, f1_selected, _ = precision_recall_fscore_support(y_test, y_pred_selected, average='binary')
    roc_auc_selected = roc_auc_score(y_test, log_reg.predict_proba(X_test_selected)[:, 1])

    print(f"Top {n_features} Features - Accuracy: {accuracy_selected}, Precision: {precision_selected}, Recall: {recall_selected}, F1-score: {f1_selected}, ROC-AUC: {roc_auc_selected}")
