In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1
...,...,...,...,...,...,...,...,...
8513,,,,,,,,5
8514,,,,,,,,2
8515,,,,,,,,4
8516,,,,,,,,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [5]:
df.dropna(subset=['Review text'], inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8510 entries, 0 to 8509
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8510 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 598.4+ KB


### Data Preprocessing

In [7]:
!pip install nltk



In [8]:
!pip install gensim



In [9]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer

# Load NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Check for missing values in 'Review text' column
df.dropna(subset=['Review text'], inplace=True)

# Text Cleaning
def clean_text(text):
    if isinstance(text, str):
        # Remove special characters and punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        return text
    else:
        return ''

# Remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# Apply text cleaning to Review text column
df['Cleaned Text'] = df['Review text'].apply(clean_text)
df['Cleaned Text'] = df['Cleaned Text'].apply(remove_stopwords)

# Text Normalization (Lemmatization)
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# Apply lemmatization to cleaned text
df['Normalized Text'] = df['Cleaned Text'].apply(lemmatize_text)

# Numerical Feature Extraction
# Bag-of-Words (BoW) Model
bow_vectorizer = CountVectorizer()
bow_features = bow_vectorizer.fit_transform(df['Normalized Text'])

# Term Frequency-Inverse Document Frequency (TF-IDF) Model
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(df['Normalized Text'])

# Word2Vec Model
word2vec_model = Word2Vec(sentences=[word_tokenize(text) for text in df['Normalized Text']], vector_size=100, window=5, min_count=1, workers=4)

def get_word_vector(word):
    if word in word2vec_model.wv:
        return word2vec_model.wv[word]
    else:
        return np.zeros(100)  # Return zero vector if word not found

word2vec_features = np.array([
    np.mean([get_word_vector(word) for word in word_tokenize(text)], axis=0)
    for text in df['Normalized Text']
])

# BERT Model
bert_tokenized_texts = df['Normalized Text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
bert_max_len = max(map(len, bert_tokenized_texts))
bert_padded_texts = np.array([text + [0] * (bert_max_len - len(text)) for text in bert_tokenized_texts])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rameshbabu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rameshbabu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rameshbabu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/rameshbabu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Assuming you have a target variable 'Sentiment' in your dataframe

X_train, X_test, y_train, y_test = train_test_split(bow_features, df['Ratings'], test_size=0.2, random_state=42)

# Logistic Regression Model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Evaluate the model
y_pred = lr_model.predict(X_test)

f1_score_lr = f1_score(y_test, y_pred, average='weighted')
print("F1 Score (Logistic Regression):", f1_score_lr)

F1 Score (Logistic Regression): 0.5591318931811269


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
!pip install joblib



In [12]:
import joblib

# Save the model to a file
joblib.dump(lr_model, 'lr_model.pkl')

['lr_model.pkl']

In [13]:
# Save the CountVectorizer object
joblib.dump(bow_vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Assuming you have a target variable 'Sentiment' in your dataframe

X_train, X_test, y_train, y_test = train_test_split(tfidf_features, df['Ratings'], test_size=0.2, random_state=42)

# Logistic Regression Model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Evaluate the model
y_pred = lr_model.predict(X_test)

f1_score_lr = f1_score(y_test, y_pred, average='weighted')
print("F1 Score (Logistic Regression):", f1_score_lr)

In [15]:
# Assuming you have a target variable 'Sentiment' in your dataframe

X_train, X_test, y_train, y_test = train_test_split(word2vec_features, df['Ratings'], test_size=0.2, random_state=42)

# Logistic Regression Model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Evaluate the model
y_pred = lr_model.predict(X_test)

f1_score_lr = f1_score(y_test, y_pred, average='weighted')
print("F1 Score (Logistic Regression):", f1_score_lr)

In [16]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred_svm = svm_model.predict(X_test)
f1_score_svm = f1_score(y_test, y_pred_svm,average='weighted')
print("F1 Score (SVM):", f1_score_svm)

In [17]:
from sklearn.svm import SVC

svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred_svm = svm_model.predict(X_test)
f1_score_svm = f1_score(y_test, y_pred_svm,average='weighted')
print("F1 Score (SVM):", f1_score_svm)

In [18]:
from sklearn.svm import SVC

svm_model = SVC(kernel='sigmoid')
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred_svm = svm_model.predict(X_test)
f1_score_svm = f1_score(y_test, y_pred_svm,average='weighted')
print("F1 Score (SVM):", f1_score_svm)

In [None]:
from sklearn.svm import SVC

svm_model = SVC(kernel='poly', degree = 10)
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred_svm = svm_model.predict(X_test)
f1_score_svm = f1_score(y_test, y_pred_svm,average='weighted')
print("F1 Score (SVM):", f1_score_svm)

#### Neural Networks

In [None]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(bow_features, df['Ratings'], test_size=0.2, random_state=42)

X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Build the Deep Neural Network model
model = Sequential()
model.add(Dense(512, input_shape=(X_train_dense.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_dense, y_train, epochs=10, batch_size=32, validation_data=(X_test_dense, y_test))

# Evaluate the model using F1-Score
y_pred = model.predict(X_test_dense)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]
f1 = f1_score(y_test, y_pred_binary, average = 'weighted')
print("F1-Score:", f1)

In [None]:
model.save("my_model.pkl")
