In [53]:
import pandas as pd
import numpy as np

# Load the chat dataset from a CSV file (path is relative to the working directory).
df= pd.read_csv("chat_dataset.csv")

In [54]:
# Class balance check: number of rows per sentiment label.
df["sentiment"].value_counts()

sentiment
neutral     259
positive    178
negative    147
Name: count, dtype: int64

In [55]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,message,sentiment
0,I really enjoyed the movie,positive
1,The food was terrible,negative
2,I'm not sure how I feel about this,neutral
3,The service was excellent,positive
4,I had a bad experience,negative


In [56]:
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
# Count missing values in each column.
df.isnull().sum()

message      0
sentiment    0
dtype: int64

# LABEL ENCODING OUTPUT

In [58]:
# Inspect the sentiment column as text labels, before it is encoded.
df["sentiment"]

0      positive
1      negative
2       neutral
3      positive
4      negative
         ...   
579    negative
580    negative
581    negative
582    negative
583    negative
Name: sentiment, Length: 584, dtype: object

In [59]:
from sklearn.preprocessing import LabelEncoder
# Encoder that is fitted on the sentiment column in the next cell.
encoder= LabelEncoder()



In [60]:
# Swap the text labels for integer codes.
# NOTE: this overwrites df["sentiment"] in place, so running the cell a second
# time would fit the encoder on the already-encoded integers.
df["sentiment"] = encoder.fit_transform(df["sentiment"])

# A class's position in encoder.classes_ is the integer it was encoded as.
mapping = {label: code for code, label in enumerate(encoder.classes_)}

# Display the mapping
print(mapping)

{'negative': 0, 'neutral': 1, 'positive': 2}


In [61]:
# mapping is 
# {'negative': 0, 'neutral': 1, 'positive': 2}

In [62]:
# Preview the data again now that the sentiment column holds integer codes.
df.head()

Unnamed: 0,message,sentiment
0,I really enjoyed the movie,2
1,The food was terrible,0
2,I'm not sure how I feel about this,1
3,The service was excellent,2
4,I had a bad experience,0


# Text Preprocessing (stemming and stop-word removal)

In [63]:
#stemming
#removing stop words
#applying tf-idf (unigrams only: TfidfVectorizer is used with its defaults)
#reassuring data

In [64]:
porter_stemmer=PorterStemmer()

# Build the stop-word lookup once. The list returned by
# stopwords.words('english') used to be re-created and scanned linearly for
# every single token; a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')

def stemming(content):
    """Normalise one message into a space-joined string of stemmed tokens.

    Steps: replace every non-letter character with a space, lower-case the
    text, split on whitespace, drop English stop words, and Porter-stem the
    remaining tokens.

    Parameters
    ----------
    content : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces. This is the empty string
        when no token survives stop-word removal.
    """
    tokens = _NON_LETTERS.sub(' ', content).lower().split()
    stems = [
        porter_stemmer.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(stems)

In [65]:
# Clean every message with stemming().
# NOTE: the raw text is overwritten in place, so the original messages are no
# longer available in df after this cell runs.
df["message"]=df["message"].apply(stemming)

In [66]:
# Preview the messages after stemming and stop-word removal.
df.head()

Unnamed: 0,message,sentiment
0,realli enjoy movi,2
1,food terribl,0
2,sure feel,1
3,servic excel,2
4,bad experi,0


In [67]:
# Check the DataFrame dimensions as (rows, columns).
df.shape

(584, 2)

In [68]:
# Features: the cleaned message text. Target: the integer-encoded sentiment.
x=df["message"]
y=df["sentiment"]

# Vectorizing

In [69]:
# Learn the TF-IDF vocabulary and weights from the cleaned messages and convert
# them to a document-term matrix in a single step.
# The input is read from df["message"] rather than from x itself: the previous
# `x = vectorizer.transform(x)` form replaced its own input, so running the
# cell a second time fed the already-vectorized matrix back into the vectorizer.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so test-set text contributes to the vocabulary and IDF weights.
vectorizer= TfidfVectorizer()
x= vectorizer.fit_transform(df["message"])

In [70]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=45)


In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

In [72]:
# Candidate classifiers, stored as classes (not instances); the evaluation
# loop calls each one to construct a fresh estimator.
models= [LogisticRegression,SVC,DecisionTreeClassifier,RandomForestClassifier]

In [73]:
# Metric accumulators: the evaluation loop appends one value per entry in
# `models`, in the same order.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []


In [74]:
# Start from empty score lists so that re-running this cell does not append a
# second copy of every model's metrics.
for scores in (accuracy_scores, precision_scores, recall_scores, f1_scores):
    scores.clear()

for model in models :
    print(f"Evaluating model : {model.__name__}")

    # Each entry in `models` is an estimator class: build it with its default
    # hyper-parameters and fit it on the training split.
    # NOTE(review): no random_state is passed, so the tree-based models may
    # score slightly differently from run to run.
    classifier= model().fit(x_train,y_train)
    y_pred=classifier.predict(x_test)

    # Hold-out accuracy: computed once, then printed and recorded.
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)

    #validation
    accuracy_scores.append(accuracy)
    # average='macro' gives every class the same weight regardless of its size.
    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))


    

Evaluating model : LogisticRegression
0.8461538461538461
Evaluating model : SVC
0.8376068376068376
Evaluating model : DecisionTreeClassifier
0.7435897435897436
Evaluating model : RandomForestClassifier
0.7863247863247863


In [75]:
#TESTING TIME

In [76]:
	# I don't have strong feelings either way	->neutral
	# The hotel was comfortable->	positive
	# This software is very complicated->	negative
	# The game was challenging->	positive
	# The speaker was inspiring->	positive


In [77]:
# Fit a logistic-regression model on the training split; it is the model used
# for the manual checks.
model = LogisticRegression()
classifier = model.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Inference recipe for a new sentence:
#   1. clean it with the same stemming() applied to the training messages,
#   2. vectorize it with the already-fitted vectorizer (transform only),
#   3. ask the model for the encoded label.
# The vectorizer is given a one-element list because it takes a collection of
# documents, not a bare string.
text = "I don't have strong feelings either way"
text = vectorizer.transform([stemming(text)])
res = model.predict(text)

# Encoded labels: 0 -> negative, 1 -> neutral, anything else -> positive.
label_names = {0: "negative", 1: "neutral"}
print(label_names.get(int(res[0]), "positive"))

neutral


In [80]:
texts = [
    "I don't have strong feelings either way",
    "The hotel was comfortable",
    "This software is very complicated",
    "The game was challenging",
    "The speaker was inspiring"
]

# Encoded labels: 0 -> negative, 1 -> neutral, anything else -> positive.
label_names = {0: "negative", 1: "neutral"}

result = []
for text in texts:
    # Stem the sentence, vectorize it with the fitted vectorizer (transform
    # only; a one-element list because it takes a collection of documents),
    # then predict its encoded label.
    text = vectorizer.transform([stemming(text)])
    res = model.predict(text)

    # Print the decoded label and keep it for the summary cells.
    label = label_names.get(int(res[0]), "positive")
    print(label)
    result.append(label)
           
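# Reasonably good: 4 of the 5 sample sentences get their expected label
# ("The speaker was inspiring" is predicted neutral instead of positive).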


neutral
positive
negative
positive
neutral


In [None]:
	# I don't have strong feelings either way	->neutral
	# The hotel was comfortable->	positive
	# This software is very complicated->	negative
	# The game was challenging->	positive
	# The speaker was inspiring->	positive

In [81]:
# Show the predicted labels collected for the sample sentences.
result

['neutral', 'positive', 'negative', 'positive', 'neutral']

In [82]:
labels =result

# Share of the predictions that are 'negative', expressed as a percentage.
negative_count = labels.count('negative')
# Guard against an empty prediction list, which would otherwise raise
# ZeroDivisionError.
percentage_negative = (negative_count / len(labels)) * 100 if labels else 0.0

print(f"Percentage of 'negative' labels: {percentage_negative}%")


Percentage of 'negative' labels: 20.0%


In [83]:
#You are free to reuse this notebook for anything, however you like.