In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import warnings
%matplotlib inline

warnings.filterwarnings('ignore')

file_path = 'C:/Users/nikhil/Programming/Jupyter Notebook/datasets/twitter_dataset.csv'


In [2]:
df = pd.read_csv('C:/Users/Nikhil/OneDrive/AppData/Desktop/project/Twitter_Data.csv', encoding='iso-8859-1')
df.head()

Unnamed: 0,tweet,label
0,when modi promised âminimum government maxim...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [3]:
# datatype info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   tweet   162976 non-null  object 
 1   label   162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [4]:
# removes pattern in the input text
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt

In [5]:
df.head()

Unnamed: 0,tweet,label
0,when modi promised âminimum government maxim...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [6]:
# Assuming 'tweet' is the column in your DataFrame
# First, replace NaN values with empty strings
df['tweet'] = df['tweet'].fillna('')

# Then, convert the 'tweet' column to string data type
df['tweet'] = df['tweet'].astype(str)

# Apply the remove_pattern function
# remove twitter handles (@user)
df['clean_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")

In [7]:
df.head()

Unnamed: 0,tweet,label,clean_tweet
0,when modi promised âminimum government maxim...,-1.0,when modi promised âminimum government maxim...
1,talk all the nonsense and continue all the dra...,0.0,talk all the nonsense and continue all the dra...
2,what did just say vote for modi welcome bjp t...,1.0,what did just say vote for modi welcome bjp t...
3,asking his supporters prefix chowkidar their n...,1.0,asking his supporters prefix chowkidar their n...
4,answer who among these the most powerful world...,1.0,answer who among these the most powerful world...


In [8]:
# remove special characters, numbers and punctuations
df['clean_tweet'] = df['tweet'].str.replace("[^a-zA-Z#]", " ")
df.head()

Unnamed: 0,tweet,label,clean_tweet
0,when modi promised âminimum government maxim...,-1.0,when modi promised âminimum government maxim...
1,talk all the nonsense and continue all the dra...,0.0,talk all the nonsense and continue all the dra...
2,what did just say vote for modi welcome bjp t...,1.0,what did just say vote for modi welcome bjp t...
3,asking his supporters prefix chowkidar their n...,1.0,asking his supporters prefix chowkidar their n...
4,answer who among these the most powerful world...,1.0,answer who among these the most powerful world...


In [9]:
# remove short words
df['clean_tweet'] = df['tweet'].apply(lambda x: " ".join([w for w in x.split() if len(w)>3]))
df.head()

Unnamed: 0,tweet,label,clean_tweet
0,when modi promised âminimum government maxim...,-1.0,when modi promised âminimum government maxim...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama will vote modi
2,what did just say vote for modi welcome bjp t...,1.0,what just vote modi welcome told rahul main ca...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar their names...
4,answer who among these the most powerful world...,1.0,answer among these most powerful world leader ...


In [10]:
# individual words considered as tokens
tokenized_tweet = df['clean_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [when, modi, promised, âminimum, government,...
1    [talk, nonsense, continue, drama, will, vote, ...
2    [what, just, vote, modi, welcome, told, rahul,...
3    [asking, supporters, prefix, chowkidar, their,...
4    [answer, among, these, most, powerful, world, ...
Name: clean_tweet, dtype: object

In [None]:
# stem the words
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
tokenized_tweet.head()

In [None]:
# combine words into single sentence
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = " ".join(tokenized_tweet[i])
    
df['clean_tweet'] = tokenized_tweet
df.head()

In [None]:
# !pip install wordcloud

In [None]:
# visualize the frequent words
all_words = " ".join([sentence for sentence in df['clean_tweet']])

from wordcloud import WordCloud
wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(all_words)

# plot the graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# frequent words visualization for +ve
all_words = " ".join([sentence for sentence in df['clean_tweet'][df['label']==0]])

wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(all_words)

# plot the graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# frequent words visualization for -ve
all_words = " ".join([sentence for sentence in df['clean_tweet'][df['label']==1]])

wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(all_words)

# plot the graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# extract the hashtag
def hashtag_extract(tweets):
    hashtags = []
    # loop words in the tweet
    for tweet in tweets:
        ht = re.findall(r"#(\w+)", tweet)
        hashtags.append(ht)
    return hashtags

In [None]:
# extract hashtags from non-racist/sexist tweets
ht_positive = hashtag_extract(df['clean_tweet'][df['label']==0])

# extract hashtags from racist/sexist tweets
ht_negative = hashtag_extract(df['clean_tweet'][df['label']==1])

In [None]:
ht_positive[:5]

In [None]:
# Assuming you have a list of lists with hashtags in ht_positive
# Extract hashtags from the list of lists
hashtags = [item for sublist in ht_positive for item in sublist]

# Create a frequency distribution of hashtags
freq = nltk.FreqDist(hashtags)

# Create a DataFrame to display the top hashtags and their counts
d = pd.DataFrame({'Hashtag': list(freq.keys()), 'Count': list(freq.values())})

# Display the top rows of the DataFrame
print(d.head())


In [None]:
# freq = nltk.FreqDist(ht_positive)
# d = pd.DataFrame({'Hashtag': list(freq.keys()),
#                  'Count': list(freq.values())})
# d.head()

In [None]:
# select top 10 hashtags
d = d.nlargest(columns='Count', n=10)
plt.figure(figsize=(15,9))
sns.barplot(data=d, x='Hashtag', y='Count')
plt.show()

In [None]:
freq = nltk.FreqDist(hashtags)
d = pd.DataFrame({'Hashtag': list(freq.keys()),
                 'Count': list(freq.values())})
d.head()

In [None]:
# select top 10 hashtags
d = d.nlargest(columns='Count', n=10)
plt.figure(figsize=(15,9))
sns.barplot(data=d, x='Hashtag', y='Count')
plt.show()

In [None]:
# feature extraction
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
bow = bow_vectorizer.fit_transform(df['clean_tweet'])

In [None]:
# bow[0].toarray()

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(bow, df['label'], random_state=42, test_size=0.25)

Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# training
model = LogisticRegression()
model.fit(x_train, y_train)

In [None]:
# testing
pred = model.predict(x_test)
f1_score(y_test, pred)

In [None]:
accuracy_score(y_test,pred)

In [None]:
# use probability to get output
pred_prob = model.predict_proba(x_test)
pred = pred_prob[:, 1] >= 0.3
pred = pred.astype(np.int)

f1_score(y_test, pred)

In [None]:
accuracy_score(y_test,pred)

In [None]:
pred_prob[0][1] >= 0.3