#***Intensity Analysis***
(Build your own model using NLP and Python)

In [None]:
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Download NLTK resources
# The stop-word corpus is read later by the stemming function.
nltk.download('stopwords')
# NOTE(review): wordnet is presumably fetched for WordNetLemmatizer, which is
# imported but not used in the cells visible here — confirm it is still needed.
nltk.download('wordnet')
#warnings
import warnings
# Silences every warning category for the rest of the session, including
# library warnings (e.g. solver convergence) that could be informative.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Show the English stop-word list that the stemming step filters out
print('stopwords:',stopwords.words('english'))

stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so'

#***Load the Dataset:-***

In [None]:
# Data Collection: read one CSV file per emotion class
happiness_data, sadness_data, angerness_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/happiness.csv',
        '/content/sadness.csv',
        '/content/angriness.csv',
    )
)

In [None]:
# Preview the happiness samples
happiness_data

Unnamed: 0,content,intensity
0,Wants to know how the hell I can remember word...,happiness
1,Love is a long sweet dream & marriage is an al...,happiness
2,The world could be amazing when you are slight...,happiness
3,My secret talent is getting tired without doin...,happiness
4,"Khatarnaak Whatsapp Status Ever‚Ä¶ Can\‚Äôt talk, ...",happiness
...,...,...
703,"If I know what love is, it is because of you.",happiness
704,The spaces between your fingers are meant to b...,happiness
705,In you i H've Found the love of my life and my...,happiness
706,The magic of first love is our ignorance that ...,happiness


In [None]:
# Count fully duplicated rows in the happiness data
happiness_data.duplicated().sum()

4

In [None]:
# Preview the sadness samples
sadness_data

Unnamed: 0,content,intensity
0,"Never hurt people who love you a lot, because ...",sadness
1,Don‚Äôt expect me to tell you what you did wrong...,sadness
2,I preferred walking away than fighting for you...,sadness
3,"Moving forward in life isn‚Äôt the hard part, it...",sadness
4,"Never cry for anyone in your life, because tho...",sadness
...,...,...
630,Stop crying over yesterday and start smiling f...,sadness
631,An Eye with Dust ‚Äòn A Heart with Trust Always ...,sadness
632,Tears come from the heart and not from the brain.,sadness
633,"Sometimes you have to hold your head up high, ...",sadness


In [None]:
# Preview the angriness samples
angerness_data

Unnamed: 0,content,intensity
0,"Sometimes I‚Äôm not angry, I‚Äôm hurt and there‚Äôs ...",angriness
1,Not available for busy people‚ò∫,angriness
2,I do not exist to impress the world. I exist t...,angriness
3,Everything is getting expensive except some pe...,angriness
4,My phone screen is brighter than my future üôÅ,angriness
...,...,...
691,Is getting close to just walking away?,angriness
692,Did you ever just want to slap the stupid out ...,angriness
693,"Is singing, if you‚Äôre angry & you know it punc...",angriness
694,Telling someone to calm down just makes them s...,angriness


#***Merge the dataset:-***

In [None]:
# Stack the three per-emotion frames into one frame with a fresh 0..n-1 index
emotion_frames = [happiness_data, sadness_data, angerness_data]
df = pd.concat(emotion_frames, ignore_index=True)

In [None]:
# Preview the merged dataset
df

Unnamed: 0,content,intensity
0,Wants to know how the hell I can remember word...,happiness
1,Love is a long sweet dream & marriage is an al...,happiness
2,The world could be amazing when you are slight...,happiness
3,My secret talent is getting tired without doin...,happiness
4,"Khatarnaak Whatsapp Status Ever‚Ä¶ Can\‚Äôt talk, ...",happiness
...,...,...
2034,Is getting close to just walking away?,angriness
2035,Did you ever just want to slap the stupid out ...,angriness
2036,"Is singing, if you‚Äôre angry & you know it punc...",angriness
2037,Telling someone to calm down just makes them s...,angriness


In [None]:
# Column names, non-null counts and dtypes of the merged dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2039 entries, 0 to 2038
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   content    2039 non-null   object
 1   intensity  2039 non-null   object
dtypes: object(2)
memory usage: 32.0+ KB


In [None]:
# Count missing values per column
df.isnull().sum()

Unnamed: 0,0
content,0
intensity,0


In [None]:
# Count fully duplicated rows in the merged dataset
df.duplicated().sum()

447

In [None]:
# Drop the duplicated rows in place. The surviving rows keep their original
# index labels (the index is not renumbered), so gaps in the index are expected.
df.drop_duplicates(inplace=True)

In [None]:
# (rows, columns) of the dataset after removing duplicates
df.shape

(1592, 2)

In [None]:
# Save the merged, de-duplicated dataset; index=False leaves the row index out of the file
df.to_csv('merged_dataset.csv', index=False)

#***Data Preprocessing:-***

In [None]:
# Shared Porter stemmer instance used by the stemming function
port_stem=PorterStemmer()

Stemming
- It is the process of reducing a word to its root word.

In [None]:
#create stemming function
_STOPWORD_CACHE = {}

def _english_stopwords():
  """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
  if 'english' not in _STOPWORD_CACHE:
    _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOPWORD_CACHE['english']

def stemming(content):
  """Clean one text sample and reduce each remaining word to its Porter stem.

  Steps: replace every character that is not an ASCII letter with a space,
  lowercase, split on whitespace, drop English stop words, stem each kept
  word with ``port_stem`` and join the stems with single spaces.

  Args:
    content: raw text of one sample (str).

  Returns:
    str: the space-separated stems; empty string if no word survives.
  """
  # Look the stop words up once per call in a set instead of re-reading the
  # corpus list for every single word.
  stop_words = _english_stopwords()
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()
  stemmed_tokens = [port_stem.stem(word) for word in tokens if word not in stop_words]
  return ' '.join(stemmed_tokens)

In [None]:
# Create the 'Content' column holding the stemmed version of the raw 'content' text
df['Content']=df['content'].apply(stemming)

In [None]:
# Function that cleans a raw text sample
def preprocess_text(text):
    """Strip HTML tags and punctuation from ``text`` and return it lowercased.

    Anything matching ``<...>`` (non-greedy) is removed first, then every
    character that is neither a word character nor whitespace is removed.
    """
    without_tags = re.compile(r'<.*?>').sub('', text)
    without_symbols = re.compile(r'[^\w\s]').sub('', without_tags)
    return without_symbols.lower()

In [None]:
# Remove HTML tags and punctuation and lowercase the text
# NOTE(review): this reads the raw 'content' column and assigns to 'Content',
# replacing the stemmed text that the stemming cell stored there. The model is
# therefore trained on unstemmed text with stop words — confirm whether the
# stemmed text was meant to be kept.
df['Content']=df['content'].apply(preprocess_text)

In [None]:
# The raw text is no longer needed once 'Content' holds the cleaned version
df.drop(columns=['content'], inplace=True)

In [None]:
# Preview the dataset after cleaning
df

Unnamed: 0,intensity,Content
0,happiness,wants to know how the hell i can remember word...
1,happiness,love is a long sweet dream marriage is an ala...
2,happiness,the world could be amazing when you are slight...
3,happiness,my secret talent is getting tired without doin...
4,happiness,khatarnaak whatsapp status ever cant talk wife...
...,...,...
2024,angriness,the embarrassment after the anger is the bigge...
2028,angriness,the strong man is not the good wrestler the st...
2030,angriness,a man is about as big as the things that make ...
2036,angriness,is singing if youre angry you know it punches...


In [None]:
# Number of samples per emotion label
df['intensity'].value_counts()

Unnamed: 0_level_0,count
intensity,Unnamed: 1_level_1
happiness,704
angriness,498
sadness,390


In [None]:
# Encode the emotion labels as integer class ids
label_to_id = {'happiness': 0, 'sadness': 1, 'angriness': 2}
df['intensity'] = df['intensity'].map(label_to_id)

In [None]:
# Number of samples per encoded class id
df['intensity'].value_counts()

Unnamed: 0_level_0,count
intensity,Unnamed: 1_level_1
0,704
2,498
1,390


In [None]:
# Count missing values per column; a label absent from the mapping would show up as NaN here
df.isnull().sum()

Unnamed: 0,0
intensity,0
Content,0


In [None]:
# Separate the cleaned text (features) from the encoded labels (target) as NumPy arrays
X, y = df['Content'].values, df['intensity'].values

In [None]:
# Inspect the feature array (cleaned text)
X

array(['wants to know how the hell i can remember words to songs from years ago but cant remember what i went into the next room for',
       'love is a long sweet dream  marriage is an alarm clock',
       'the world could be amazing when you are slightly strange', ...,
       'a man is about as big as the things that make him angry',
       'is singing if youre angry  you know it punches their face',
       'telling someone to calm down just makes them so much angrier'],
      dtype=object)

In [None]:
# Inspect the target array (encoded labels)
y

array([0, 0, 0, ..., 2, 2, 2])

In [None]:
# Number of text samples
X.shape

(1592,)

In [None]:
# Number of labels
y.shape

(1592,)

#***Splitting the Dataset Train/Test:-***

In [None]:
# Train/Test Split
# Hold out 20% of the samples for testing; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (see the value counts above) — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Shapes of the train/test splits
print("X_train shape:-",X_train.shape)
print("X_test shape:-",X_test.shape)
print("y_train shape:-",y_train.shape)
print("y_test shape:-",y_test.shape)

X_train shape:- (1273,)
X_test shape:- (319,)
y_train shape:- (1273,)
y_test shape:- (319,)


#***Vectorize the dataset:-***

In [None]:
# Convert the textual data to numerical TF-IDF features
vectorizer=TfidfVectorizer()

# Fit on the training texts only, then reuse the fitted vectorizer for the test texts.
# NOTE(review): X_train / X_test are rebound from text arrays to feature matrices,
# so this cell is not safe to run twice without re-running the split cell first.
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [None]:
# Save the fitted vectorizer so the same transformation can be reused at prediction time.
# The context manager closes the file even if pickling fails.
with open('vectorizer.pkl','wb') as vectorizer_file:
    pickle.dump(vectorizer,vectorizer_file)
print('vectorizer save sucessfully')

vectorizer save sucessfully


In [None]:
# After vectorizing, X_train is (n_samples, n_features); y_train is unchanged
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (1273, 2266)
y_train shape: (1273,)


#***Build the model:-***

In [None]:
# Baseline classifier: logistic regression with the solver iteration cap raised to 1000
model=LogisticRegression(max_iter=1000)

In [None]:
# Fit the baseline model on the TF-IDF training features
model.fit(X_train,y_train)

In [None]:
# Evaluate the baseline model on the training data
X_train_prediction=model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): true labels go first
training_data_accuracy=accuracy_score(y_train,X_train_prediction)

In [None]:
# Print the accuracy score on the training data
print('accuracy score on training data:',training_data_accuracy)

accuracy score on training data: 0.8978790259230165


In [None]:
# Evaluate the baseline model on the held-out test data
X_test_prediction=model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): true labels go first
test_data_accuracy=accuracy_score(y_test,X_test_prediction)

In [None]:
# Print the accuracy score on the test data
print('accuracy score on test data:',test_data_accuracy)

accuracy score on test data: 0.7178683385579937


#***Hyperparameter Tuning:-***

In [None]:
# Hyperparameter tuning of the logistic regression
from sklearn.model_selection import GridSearchCV
# Candidate values for C, the inverse regularisation strength
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# Use the same max_iter as the baseline model. The library default is much lower,
# and warnings are silenced in this notebook, so a non-converged fit would go unnoticed.
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
# 5-fold cross-validation on the training features only
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
# Best parameter combination found by the grid search
best_params

{'C': 10}

In [None]:
# Estimator selected by the grid search
best_model

#***check model accuracy after hyperparameter tuning:-***

In [None]:
# Evaluate the tuned model on the training data
# NOTE(review): only the training accuracy of the tuned model is computed here;
# its accuracy on X_test / y_test is not evaluated in the visible cells.
X_train_prediction=best_model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): true labels go first
training_data_accuracy=accuracy_score(y_train,X_train_prediction)

In [None]:
# Print the tuned model's accuracy on the training data. The value printed is
# training_data_accuracy, so the label must say "training", not "test".
print('accuracy score on training data:',training_data_accuracy)

accuracy score on test data: 0.9890023566378633


#***Save the model:-***

In [None]:
# Save the tuned model. pickle is imported in the notebook's top import cell.
# The context manager closes the file even if pickling fails.
with open('best_model.pkl','wb') as model_file:
    pickle.dump(best_model,model_file)
print('Model save sucessfully')

Model save sucessfully
