#Data Fitiing

Ensuring the data frame fits our requirement.



In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = '/content/drive/MyDrive/Sentiment/D1.csv'  #file path
frame1 = pd.read_csv(file_path, encoding='utf-8') # Read the CSV file into a DataFrame

In [None]:
frame1.shape

(40000, 2)

In [None]:
frame1

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [None]:
frame1.columns

Index(['text', 'label'], dtype='object')

In [None]:
frame1.columns=['review','sentiment']

In [None]:
frame1.columns

Index(['review', 'sentiment'], dtype='object')

In [None]:
# Define the custom labels
positive = 'Positive'
negative = 'Negative'

# Replace values in the 'sentiment' column with custom labels
frame1['sentiment'] = frame1['sentiment'].replace({1: positive, 0: negative})

In [None]:
frame1

Unnamed: 0,review,sentiment
0,I grew up (b. 1965) watching and loving the Th...,Negative
1,"When I put this movie in my DVD player, and sa...",Negative
2,Why do people who do not know what a particula...,Negative
3,Even though I have great interest in Biblical ...,Negative
4,Im a die hard Dads Army fan and nothing will e...,Positive
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",Positive
39996,This movie is an incredible piece of work. It ...,Positive
39997,My wife and I watched this movie because we pl...,Negative
39998,"When I first watched Flatliners, I was amazed....",Positive


In [None]:
frame1.isnull().any().sum()

0

In [None]:
frame1.describe()

Unnamed: 0,review,sentiment
count,40000,40000
unique,39723,2
top,"Hilarious, clean, light-hearted, and quote-wor...",Negative
freq,4,20019


In [None]:
frame1['sentiment'].value_counts()

Negative    20019
Positive    19981
Name: sentiment, dtype: int64

# Data Cleaning

Check lis for cleaning data for sentiment analysis
1. Text Lowercasing (done)
2. Special Character Removal (done)
3. Punctuation Removal (done)
4. Number Removal (done)
5. Handling Emojis and Special Characters (done)
6. Whitespace Removal (done)
7. Handling URLs (done)
8. Removing user names @ (done)



In [None]:
import re # Import the re module for working with regular expressions

In [None]:
def noiseremovel_text(review):
    review = review.lower() # Used to convert to lowercase
    review = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', review) # Remove URLs
    review = re.sub(r"[^a-z\s@]", "", review)  # Remove special characters
    review = re.sub(r"\d+", "", review) # Remove numbers
    review = re.sub(r"@", "", review) # Remove @ symbols
    review = re.sub(r"[^\w\s]", "", review) # Remove punctuation
    review = re.sub(r'[^\x00-\x7F]+', '', review) # Remove emojis
    review = re.sub(r'\s+', ' ', review).strip() # Remove extra whitespaces and trim
    return review

In [None]:
frame1['review'] = frame1['review'].apply(noiseremovel_text)

In [None]:
frame1

Unnamed: 0,review,sentiment
0,i grew up b watching and loving the thunderbir...,Negative
1,when i put this movie in my dvd player and sat...,Negative
2,why do people who do not know what a particula...,Negative
3,even though i have great interest in biblical ...,Negative
4,im a die hard dads army fan and nothing will e...,Positive
...,...,...
39995,western union is something of a forgotten clas...,Positive
39996,this movie is an incredible piece of work it e...,Positive
39997,my wife and i watched this movie because we pl...,Negative
39998,when i first watched flatliners i was amazed i...,Positive


# Stemming

Stemming is a text normalization technique. Information retrieval to reduce words to their base or root form.

The main purpose of stemming is to group words with similar meanings together, even if they are inflected or derived forms of the same word.


In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer


In [None]:
def stemmer(text):
  ps = nltk.porter.PorterStemmer()
  text = '.'.join([ps.stem(word) for word in text.split()])
  return text

In [None]:
frame1['review'] = frame1['review'].apply(stemmer)

In [None]:
frame1

Unnamed: 0,review,sentiment
0,i.grew.up.b.watch.and.love.the.thunderbird.all...,Negative
1,when.i.put.thi.movi.in.my.dvd.player.and.sat.d...,Negative
2,whi.do.peopl.who.do.not.know.what.a.particular...,Negative
3,even.though.i.have.great.interest.in.biblic.mo...,Negative
4,im.a.die.hard.dad.armi.fan.and.noth.will.ever....,Positive
...,...,...
39995,western.union.is.someth.of.a.forgotten.classic...,Positive
39996,thi.movi.is.an.incred.piec.of.work.it.explor.e...,Positive
39997,my.wife.and.i.watch.thi.movi.becaus.we.plan.to...,Negative
39998,when.i.first.watch.flatlin.i.wa.amaz.it.had.al...,Positive


# Tokenization

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer

In [None]:
# Download the necessary data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
tokenizers = ToktokTokenizer() # Create a ToktokTokenizer instance

In [None]:
# Function to remove stopwords from text
def remove_stopwords(text, stop_words=None, is_lower_case=False):
    # Use a regular expression to split the text into words while preserving consecutive full-stop signs
    words = re.findall(r'\w+|\.\.+', text)

    # Remove stopwords
    if stop_words is None:
        stop_words = set()
    else:
        stop_words = set(stop_words)

    if is_lower_case:
        # Remove stopwords without converting tokens to lowercase
        filtokens = [i for i in words if i not in stop_words]
    else:
        # Remove stopwords after converting tokens to lowercase
        filtokens = [i for i in words if i.lower() not in stop_words]

    # Join the filtered words back into a sentence
    filtered_text = ' '.join(filtokens)
    return filtered_text

# Get the English stopwords
stop_wr = set(stopwords.words('english'))

# Apply the remove_stopwords function to the 'review' column of the DataFrame
frame1['review'] = frame1['review'].apply(remove_stopwords, stop_words=stop_wr)

In [None]:
frame1

Unnamed: 0,review,sentiment
0,grew b watch love thunderbird mate school watc...,Negative
1,put thi movi dvd player sat coke chip expect w...,Negative
2,whi peopl know particular time past wa like fe...,Negative
3,even though great interest biblic movi wa bore...,Negative
4,im die hard dad armi fan noth ever chang got t...,Positive
...,...,...
39995,western union someth forgotten classic western...,Positive
39996,thi movi incred piec work explor everi nook cr...,Positive
39997,wife watch thi movi becaus plan visit sicili s...,Negative
39998,first watch flatlin wa amaz necessari featur g...,Positive


# Sorting Data (small dataset)

In [None]:
frame1

Unnamed: 0,review,sentiment
0,grew b watch love thunderbird mate school watc...,Negative
1,put thi movi dvd player sat coke chip expect w...,Negative
2,whi peopl know particular time past wa like fe...,Negative
3,even though great interest biblic movi wa bore...,Negative
4,im die hard dad armi fan noth ever chang got t...,Positive
...,...,...
39995,western union someth forgotten classic western...,Positive
39996,thi movi incred piec work explor everi nook cr...,Positive
39997,wife watch thi movi becaus plan visit sicili s...,Negative
39998,first watch flatlin wa amaz necessari featur g...,Positive


In [None]:
frame1 = frame1.iloc[:5000]


In [None]:
frame1

Unnamed: 0,review,sentiment
0,grew b watch love thunderbird mate school watc...,Negative
1,put thi movi dvd player sat coke chip expect w...,Negative
2,whi peopl know particular time past wa like fe...,Negative
3,even though great interest biblic movi wa bore...,Negative
4,im die hard dad armi fan noth ever chang got t...,Positive
...,...,...
4995,absolut putrid slasher film ha one redeem qual...,Negative
4996,one franc farmer earliest movi absolut beauti ...,Negative
4997,rent thi think would pretti good cover movi ca...,Negative
4998,went thi movi becaus husband enjoy origin vers...,Negative


# Feature Extraction


In [None]:
import sklearn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfModel = TfidfVectorizer()
print(tfidfModel.fit_transform(frame1["review"]))


  (0, 16750)	0.11805609090251878
  (0, 10162)	0.1109527993646851
  (0, 15103)	0.0757160310090233
  (0, 30648)	0.1609418803035905
  (0, 27207)	0.3070935299259363
  (0, 14818)	0.13809323941878146
  (0, 19288)	0.1609418803035905
  (0, 26108)	0.09717048249565605
  (0, 140)	0.08345528106048351
  (0, 19535)	0.058902185340217544
  (0, 26008)	0.08905295198564415
  (0, 5109)	0.09890880660467016
  (0, 26855)	0.1041557244396881
  (0, 33862)	0.10151876544675342
  (0, 11199)	0.028341562101549547
  (0, 34645)	0.0661790323738927
  (0, 14883)	0.11347246793044276
  (0, 6271)	0.06009284285933516
  (0, 34083)	0.07017323138954028
  (0, 5130)	0.11037374760102926
  (0, 8446)	0.054496464397667206
  (0, 13822)	0.0674337572727897
  (0, 11873)	0.15354676496296815
  (0, 16666)	0.1609418803035905
  (0, 7008)	0.07266264386702191
  :	:
  (4999, 10310)	0.1596773860956138
  (4999, 10790)	0.09598741722187902
  (4999, 32222)	0.10641511572101618
  (4999, 14907)	0.11653253375671793
  (4999, 27641)	0.09570558862331932
  (

In [None]:
print(tfidfModel.fit_transform(frame1["review"]).todense())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
tfidfModel.vocabulary_

{'grew': 13353,
 'watch': 34657,
 'love': 18617,
 'thunderbird': 31992,
 'mate': 19454,
 'school': 27492,
 'play': 24076,
 'befor': 2757,
 'dure': 9354,
 'lunch': 18767,
 'want': 34544,
 'virgil': 34278,
 'scott': 27581,
 'one': 22440,
 'alan': 669,
 'count': 6793,
 'becam': 2698,
 'art': 1668,
 'form': 11765,
 'took': 32300,
 'children': 5437,
 'see': 27768,
 'movi': 20795,
 'hope': 14877,
 'would': 35559,
 'get': 12647,
 'glimps': 12864,
 'child': 5422,
 'bitterli': 3242,
 'disappoint': 8488,
 'onli': 22495,
 'high': 14495,
 'point': 24222,
 'wa': 34431,
 'snappi': 29201,
 'theme': 31703,
 'tune': 32848,
 'could': 6778,
 'compar': 6236,
 'origin': 22645,
 'score': 27563,
 'thank': 31658,
 'earli': 9440,
 'saturday': 27269,
 'morn': 20684,
 'televis': 31492,
 'channel': 5181,
 'still': 30238,
 'rerun': 26160,
 'seri': 28036,
 'gerri': 12637,
 'anderson': 1121,
 'hi': 14465,
 'wife': 35161,
 'creat': 7008,
 'jonatha': 16666,
 'frake': 11873,
 'hand': 13822,
 'director': 8446,
 'chair':

In [None]:
tfidf_df = pd.DataFrame(tfidfModel.fit_transform(frame1["review"]).todense())
tfidf_df.columns = sorted(tfidfModel.vocabulary_)
tfidf_df


Unnamed: 0,aa,aaaaah,aaah,aaargh,aaip,aakash,aaliyah,aamir,aamr,aankhen,...,zurich,zutaut,zwart,zwick,zy,zzzz,zzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Define the custom labels
positive = 'Positive'
negative = 'Negative'

# Replace values in the 'sentiment' column with custom labels
frame1['sentiment'] = frame1['sentiment'].replace({positive :1,negative :0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame1['sentiment'] = frame1['sentiment'].replace({positive :1,negative :0})


In [None]:
frame1

Unnamed: 0,review,sentiment
0,grew b watch love thunderbird mate school watc...,0
1,put thi movi dvd player sat coke chip expect w...,0
2,whi peopl know particular time past wa like fe...,0
3,even though great interest biblic movi wa bore...,0
4,im die hard dad armi fan noth ever chang got t...,1
...,...,...
4995,absolut putrid slasher film ha one redeem qual...,0
4996,one franc farmer earliest movi absolut beauti ...,0
4997,rent thi think would pretti good cover movi ca...,0
4998,went thi movi becaus husband enjoy origin vers...,0


# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split #used to divide the dayta for trainig and testing

# Splitting the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, frame1['sentiment'], test_size=0.2, random_state=42, stratify=frame1['sentiment'])


# Model 1
Logistic Regression

Accuracy is 0.9462

In [None]:
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
logreg = LogisticRegression(max_iter=1000)

# Train the model using training data
logreg.fit(X_train, y_train)


In [None]:
logreg.fit(tfidf_df, frame1['sentiment'])


In [None]:
predicted_labels_train = logreg.predict(tfidf_df)


In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(frame1['sentiment'], predicted_labels_train)
print("Training Accuracy:", accuracy)


Training Accuracy: 0.9462


# Model 2
Supervised Machine Learning

Accuracy is 0.854

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# You've already transformed your data using TF-IDF, so no need to use CountVectorizer again
X = tfidf_df

# Assuming the 'sentiment' column in frame1 is your target variable
y = frame1['sentiment']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model (you've done this step already, but for clarity, I'm keeping it)
logreg = LogisticRegression(penalty='l2', max_iter=1000, C=1, random_state=42)

# Fitting the model on the training data
logreg.fit(X_train, y_train)

# Make predictions on the test data
y_pred = logreg.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.854
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       495
           1       0.85      0.86      0.86       505

    accuracy                           0.85      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.85      0.85      0.85      1000



In [None]:
# Make predictions on new text data
new_text = ['the t-shirt was bad']

# Convert the new text data using the TF-IDF vectorizer
new_text_tfidf = tfidfModel.transform(new_text)

# Make predictions using the trained Logistic Regression model (logreg)
y_pred = logreg.predict(new_text_tfidf)

print("Predicted Sentiment:", y_pred[0])


Predicted Sentiment: 0


