# Data Science and Machine Learning Internship Program


#  Mini Project 3 –    Twitter Sentimental Analysis Using NLP and Python

# Scenario: 

By analyzing text data, we can find meaningful insights from non-numeric data that
can help us achieve our objective. With the help of NLP and its concepts, we can do it. Twitter is
one of the biggest platforms that people use to write their messages, express their feelings
about a particular topic, and share knowledge in the form of text. By analyzing text data, we can
make good decisions for different use cases like judging the sentiment of the human tweets, and
any product review/comments can tell us the performance of a product in the market.


# Importing Neccessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Task 1. Read the Data from the Given excel file.

In [2]:
# Read the csv file
data = pd.read_csv('Twitter_Data.csv')

# Display the first few rows
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


# Task 2. Change our dependent variable to categorical. ( 0 to “Neutral,” -1 to “Negative”, 1 to “Positive”)


In [3]:
# Map numeric categories to strings
category_mapping = {0: "Neutral", -1: "Negative", 1: "Positive"}
data['category'] = data['category'].map(category_mapping)

# Verify changes
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive


# Task 3. Do Missing value analysis and drop all null/missing values

In [4]:
# Check for missing values
data.isnull().sum()

# Drop rows with any missing values
data = data.dropna()

# Verify no missing values
data.isnull().sum()

clean_text    0
category      0
dtype: int64

# Task 4. Do text cleaning. (remove every symbol except alphanumeric, transform all words tolower case, and remove punctuation and stopwords)


In [5]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Apply text cleaning
data['cleaned_tweet'] = data['clean_text'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
data['cleaned_tweet']

0         modi promised minimum government maximum gover...
1                    talk nonsense continue drama vote modi
2         say vote modi welcome bjp told rahul main camp...
3         asking supporters prefix chowkidar names modi ...
4         answer among powerful world leader today trump...
                                ...                        
162975    456 crores paid neerav modi recovered congress...
162976    dear rss terrorist payal gawar modi killing 10...
162977                         cover interaction forum left
162978    big project came india modi dream project happ...
162979    ever listen like gurukul discipline maintained...
Name: cleaned_tweet, Length: 162969, dtype: object

# Task 5. Create a new column and find the length of each sentence (how many words they contain)

In [7]:
# Calculate the length of each sentence
data['sentence_length'] = data['cleaned_tweet'].apply(lambda x: len(x.split()))

# Task 6. Split data into dependent(X) and independent(y) dataframe

In [8]:
# Separate the features and the target variable
X = data['cleaned_tweet']
y = data['category']

In [9]:
X

0         modi promised minimum government maximum gover...
1                    talk nonsense continue drama vote modi
2         say vote modi welcome bjp told rahul main camp...
3         asking supporters prefix chowkidar names modi ...
4         answer among powerful world leader today trump...
                                ...                        
162975    456 crores paid neerav modi recovered congress...
162976    dear rss terrorist payal gawar modi killing 10...
162977                         cover interaction forum left
162978    big project came india modi dream project happ...
162979    ever listen like gurukul discipline maintained...
Name: cleaned_tweet, Length: 162969, dtype: object

In [10]:
y 

0         Negative
1          Neutral
2         Positive
3         Positive
4         Positive
            ...   
162975    Negative
162976    Negative
162977     Neutral
162978     Neutral
162979    Positive
Name: category, Length: 162969, dtype: object

# Task 7. Do operations on text data 

In [11]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences
max_length = max(data['sentence_length'])
X_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# Encode target variable
y_encoded = pd.get_dummies(y)

In [12]:
y_encoded

Unnamed: 0,Negative,Neutral,Positive
0,True,False,False
1,False,True,False
2,False,False,True
3,False,False,True
4,False,False,True
...,...,...,...
162975,True,False,False
162976,True,False,False
162977,False,True,False
162978,False,True,False


# Task 8. • Train new model

In [13]:
# Parameters
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
input_length = max_length

# Build the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))  # 3 classes: Neutral, Negative, Positive

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Split the Data into Train and Test Sets

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Train the Model

In [15]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Task 9. Normalize the prediction as same as the original data(prediction might be in decimal, so whoever is nearest to 1 is predicted as yes and set other as 0)

In [16]:
# Predict and normalize predictions
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=-1)
y_true = y_test.values.argmax(axis=-1)

# Print classification report
print(classification_report(y_true, y_pred_classes, target_names=category_mapping.values()))

              precision    recall  f1-score   support

     Neutral       0.80      0.81      0.81      7152
    Negative       0.91      0.92      0.91     11067
    Positive       0.90      0.89      0.89     14375

    accuracy                           0.88     32594
   macro avg       0.87      0.87      0.87     32594
weighted avg       0.88      0.88      0.88     32594

