In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# df = pd.read_csv("sentiment labelled sentences/data.csv")
df = pd.read_csv("EcoPreprocessed.csv")


In [3]:
df.head(100)

Unnamed: 0.1,Unnamed: 0,review,polarity,division
0,3870,able play youtube alexa,0.5000,positive
1,62,able recognize indian accent really well drop ...,0.2794,positive
2,487,absolute smart device amazon connect external ...,0.1827,positive
3,3204,absolutely amaze new member family control hom...,0.3682,positive
4,1265,absolutely amaze previously sceptical invest m...,0.2333,positive
...,...,...,...,...
95,2403,alexa good thing will help us study good alexa...,0.7233,positive
96,50,alexa good understand indian accent cover hind...,0.3229,positive
97,346,alexa good understand query well conversation ...,0.3667,positive
98,743,alexa good value money fun use use india make ...,0.3750,positive


In [4]:
print(df.describe())

        Unnamed: 0     polarity
count  4084.000000  4084.000000
mean   2070.871205     0.350653
std    1203.976943     0.367858
min       0.000000    -1.000000
25%    1024.750000     0.006725
50%    2061.500000     0.383300
75%    3119.250000     0.607100
max    4156.000000     1.000000


In [5]:
df = df.drop(["Unnamed: 0","polarity"],axis =1)
df.head()

Unnamed: 0,review,division
0,able play youtube alexa,positive
1,able recognize indian accent really well drop ...,positive
2,absolute smart device amazon connect external ...,positive
3,absolutely amaze new member family control hom...,positive
4,absolutely amaze previously sceptical invest m...,positive


In [6]:
df.rename(columns={'division':'label'}, inplace=True)

In [7]:
df.head()

Unnamed: 0,review,label
0,able play youtube alexa,positive
1,able recognize indian accent really well drop ...,positive
2,absolute smart device amazon connect external ...,positive
3,absolutely amaze new member family control hom...,positive
4,absolutely amaze previously sceptical invest m...,positive


In [8]:
df.rename(columns={'review': 'sentence'}, inplace=True)

In [9]:
df.head()

Unnamed: 0,sentence,label
0,able play youtube alexa,positive
1,able recognize indian accent really well drop ...,positive
2,absolute smart device amazon connect external ...,positive
3,absolutely amaze new member family control hom...,positive
4,absolutely amaze previously sceptical invest m...,positive


In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [Errno 60] Operation
[nltk_data]     timed out>


False

In [11]:
#Convert labels to numeric values if they are in text format (e.g., 'positive', 'negative', 'neutral' -> 1, -1, 0)
label_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}
df['label'] = df['label'].map(label_mapping)


In [12]:
df.tail()

Unnamed: 0,sentence,label
4079,yo yo yo love go if want one smart speaker val...,1
4080,youtube music,0
4081,youtube support nahi kartasong recognise achha...,0
4082,yup proscontrols wipro light amazinglysony bra...,0
4083,zero integration capabilities fire tv devices ...,-1


In [13]:
 #Function to preprocess a text string\
stop_words = set(stopwords.words('english'))  # NLTK stopwords
def preprocess_text(text):
    # Remove special characters, numbers, and punctuations
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text (split it into words)
    tokens = text.split()
    
    # Remove stopwords and lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Rejoin tokens into a single string
    text = ' '.join(tokens)
    
    return text

# Apply preprocessing to the 'sentence' column
df['sentence'] = df['sentence'].apply(preprocess_text)

In [14]:
df['sentence']

0                                 able play youtube alexa
1       able recognize indian accent really well drop ...
2       absolute smart device amazon connect external ...
3       absolutely amaze new member family control hom...
4       absolutely amaze previously sceptical invest m...
                              ...                        
4079    yo yo yo love go want one smart speaker value ...
4080                                        youtube music
4081    youtube support nahi kartasong recognise achha...
4082    yup proscontrols wipro light amazinglysony bra...
4083    zero integration capability fire tv device use...
Name: sentence, Length: 4084, dtype: object

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000,  # Set the maximum number of features (words) to consider
                                   stop_words='english',  # Remove common English stop words
                                   )
tfidf = tfidf_vectorizer

# Fit and transform the text data to create TF-IDF features
tfidf_features = tfidf.fit_transform(df['sentence'])

# Convert the TF-IDF features to a DataFrame
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combine the TF-IDF features DataFrame with the 'label' column
final_df = pd.concat([tfidf_df, df['label']], axis=1)




In [16]:
df['sentence']

0                                 able play youtube alexa
1       able recognize indian accent really well drop ...
2       absolute smart device amazon connect external ...
3       absolutely amaze new member family control hom...
4       absolutely amaze previously sceptical invest m...
                              ...                        
4079    yo yo yo love go want one smart speaker value ...
4080                                        youtube music
4081    youtube support nahi kartasong recognise achha...
4082    yup proscontrols wipro light amazinglysony bra...
4083    zero integration capability fire tv device use...
Name: sentence, Length: 4084, dtype: object

In [17]:
df['label']

0       1
1       1
2       1
3       1
4       1
       ..
4079    1
4080    0
4081    0
4082    0
4083   -1
Name: label, Length: 4084, dtype: int64

In [18]:
final_df

Unnamed: 0,aap,ability,able,absolutely,ac,accent,accept,access,accord,account,...,wynk,year,yell,yes,yo,youll,youre,youtube,zoom,label
0,0.0,0.0,0.601345,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.607512,0.0,1
1,0.0,0.0,0.168119,0.000000,0.0,0.220581,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,1
2,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,1
3,0.0,0.0,0.000000,0.367169,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,1
4,0.0,0.0,0.000000,0.671650,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4079,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.918169,0.0,0.0,0.000000,0.0,1
4080,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.808798,0.0,0
4081,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.302524,0.0,0
4082,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0


In [19]:
# Split the data into training and testing sets (e.g., 80% for training and 20% for testing)
x = final_df.drop('label', axis=1)  # Features
y = final_df['label']  # Labels

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)
X_train.head(10)

Unnamed: 0,aap,ability,able,absolutely,ac,accent,accept,access,accord,account,...,wrong,wynk,year,yell,yes,yo,youll,youre,youtube,zoom
3223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
  #Random Forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=50, criterion='gini',random_state=0)
model.fit(X_train, Y_train)

In [22]:
  print('Random Forest Training Accuracy:', model.score(X_train, Y_train))

Random Forest Training Accuracy: 0.9880624426078971


In [23]:

from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy_score(Y_test, y_pred)

0.8739290085679314

In [24]:
import pickle

In [33]:
review = ["Not responding gives the impression that you don't care about your customers and their experience"]
review = tfidf.transform(review)
predict = model.predict(review)

if predict == 1:
    print('Positive')
elif predict == 0:
    print('neutral')
else:
    print('negative')

neutral


In [26]:
# some negative words
# Poor quality of work
# Not responding gives the impression that you don't care about your customers and their experience

In [27]:
# positive
# The staff was great. The receptionists were very helpful and answered all our questions. The room was clean and bright, and the room service was always on time. Will be coming back! Thank you so much.

In [28]:
import pickle

# Save the trained model to a file
model_filename = 'sentiment_analysis_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Now, you can use this pickle file to load the model later
