In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [8]:
df = pd.read_csv('../resources/train.csv', header=None)
df.columns = ['Polarity','Product','Review']

In [9]:
df

Unnamed: 0,Polarity,Product,Review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,1,Don't do it!!,The high chair looks great when it first comes...
3599996,1,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,1,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,1,what is it saying?,not sure what this book is supposed to be. It ...


In [10]:
sample_proportion = 0.0001  # Represents 30%
sample_size = int(sample_proportion * len(df))  # Ensure integer size

# Create the stratified shuffle split object with the correct sample size
sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_size, random_state=42)

# Split the data using stratified sampling
for train_index, test_index in sss.split(df, df["Polarity"]):
    sampled_data = df.iloc[test_index]

sampled_data

Unnamed: 0,Polarity,Product,Review
1857688,1,Warning- Sharp edges!!,The edges on the chairs are all sharp and not ...
2516367,2,great dvd,my son loves power rangers so i purched all mo...
1135058,1,Unreliable,I bought this device about 6 months ago. It's ...
2398657,2,kingmom4,Very pretty picture. Arrived in short amount o...
665019,2,A Much Needed Resource,For parents (or anyone) seeking answers on thi...
...,...,...,...
2582896,1,Eat nuts because Squirrels do,I like that he offers a different concept in t...
3093304,2,The best Historical based Fiction I've ever read.,Diana Gabaldon has managed to awaken interests...
3145713,1,Wrong size for me,It looked to me like this size gasket would wo...
3387233,1,Junk,Bought this for a rental house in the end of 2...


In [11]:
def polarity_optimisation(num):
    if(num == 2):
        return 0
    else:
        return 1

In [12]:
def pre_processor(corpus):
    # Optimize polarity (replace with appropriate logic)
    corpus.loc[:, 'Polarity'] = corpus['Polarity'].apply(polarity_optimisation)

    # Remove whitespace
    corpus.loc[:, 'Review'] = corpus['Review'].apply(lambda x: x.strip())

    # Convert to lowercase
    corpus.loc[:, 'Review'] = corpus['Review'].apply(lambda x: x.lower())

    # Remove non-English characters
    pattern_english = re.compile(r'[^a-zA-Z\s]')
    corpus.loc[:, 'Review'] = corpus['Review'].apply(lambda x: re.sub(pattern_english, '', x))

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    corpus.loc[:, 'Review'] = corpus['Review'].apply(
        lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words])
    )

    # Apply stemming
    stemmer = PorterStemmer()
    corpus.loc[:, 'Review'] = corpus['Review'].apply(
        lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)])
    )

    return corpus

In [13]:
processed_dataset = pre_processor(sampled_data)
processed_dataset

Unnamed: 0,Polarity,Product,Review
1857688,1,Warning- Sharp edges!!,edg chair sharp round month old scratch back l...
2516367,0,great dvd,son love power ranger purch song ill keep buy
1135058,1,Unreliable,bought devic month ago unreli inconsist someti...
2398657,0,kingmom4,pretti pictur arriv short amount time damag co...
665019,0,A Much Needed Resource,parent anyon seek answer effect rel unknown ty...
...,...,...,...
2582896,1,Eat nuts because Squirrels do,like offer differ concept relat us winter spri...
3093304,0,The best Historical based Fiction I've ever read.,diana gabaldon manag awaken interest never kne...
3145713,1,Wrong size for me,look like size gasket would work fagor pressur...
3387233,1,Junk,bought rental hous end stop work two month ago...


In [14]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(df['Review'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, df['Polarity'], test_size=0.2, random_state=42)

# Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)


In [15]:
# Read the test dataset
test_corpus = pd.read_csv('../resources/test.csv', header=None, nrows=40000)
test_corpus.columns = ['Polarity', 'Product', 'Review']  # Assuming this is correct
test_corpus = test_corpus[['Polarity', 'Review']].reset_index(drop=True)

In [23]:
# Define the preprocessing function
def pre_processor(corpus):
    # Remove whitespace
    corpus['Review'] = corpus['Review'].str.strip()

    # Convert to lowercase
    corpus['Review'] = corpus['Review'].str.lower()

    # Remove non-English characters
    pattern_english = re.compile(r'[^a-zA-Z\s]')
    corpus['Review'] = corpus['Review'].apply(lambda x: re.sub(pattern_english, '', x))

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    corpus['Review'] = corpus['Review'].apply(
        lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words])
    )

    # Apply stemming
    stemmer = PorterStemmer()
    corpus['Review'] = corpus['Review'].apply(
        lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)])
    )

    return corpus

In [26]:
test_corpus.head(10)

Unnamed: 0,Polarity,Review
0,0,love pat one great voic gener listen cd year s...
1,0,despit fact play small portion game music hear...
2,1,bought charger jul work ok design nice conveni...
3,0,check maha energi websit powerex mhcf charger ...
4,0,review quit bit combo player hesit due unfavor...
5,1,also began incorrect disc problem ive read vcr...
6,1,love style coupl year dvd give problem doesnt ...
7,1,scroll dvd menu set vertic triangl key select ...
8,0,exot tale orient dr shen fu weird tale magazin...
9,1,firstlyi enjoy format tone book author address...


In [17]:
# Preprocessing test data
test_corpus = pre_processor(test_corpus)
test_corpus['Polarity'] = test_corpus['Polarity'].apply(polarity_optimisation)

In [24]:
# Make predictions on test data
X_test_tfidf = vectorizer.transform(test_corpus['Review'])

predictions = model.predict(X_test_tfidf)

y_test = test_corpus['Polarity']
y_pred = model.predict(X_test_tfidf)

In [25]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.364175

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00     20359
           1       0.74      0.74      0.74     19641
           2       0.00      0.00      0.00         0

    accuracy                           0.36     40000
   macro avg       0.25      0.25      0.25     40000
weighted avg       0.36      0.36      0.36     40000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
def sentiment_analysis(text):
  text = "This movie was fantastic!"
  new_features = vectorizer.transform([text])
  prediction = model.predict(new_features)[0]

  if prediction == "positive":
    print("Positive")
  else:
    print("Negative")

In [21]:
sentiment_analysis("terrible product")

Negative


In [22]:
sentiment_analysis("cool, i bough it for 3 times!")

Negative


In [31]:
pickle_out = open("../resource/output/vectorizer.pkl", "wb")
pickle.dump(vectorizer, pickle_out)
pickle_out.close()

pickle_out = open("../resource/output/bayesian.pkl", "wb")
pickle.dump(model, pickle_out)
pickle_out.close()