In [1]:
#importing necessary packages
import pandas as pd
import numpy as np
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

### Data Wrangling Exercise

I am using the Twitter Sentiment Analysis Dataset, which can be found here. https://www.kaggle.com/datasets/kazanova/sentiment140. I am using this dataset to build a model that identifies body shaming tweets. 

In [2]:
data = pd.read_csv('Twittertext.csv', encoding='latin-1', header = None)

In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
data.sample(20)

Unnamed: 0,0,1,2,3,4,5
755665,0,2288402080,Mon Jun 22 19:06:55 PDT 2009,NO_QUERY,TaraPants,Only saw the last few mins of Jon and Kate but...
481381,0,2179620058,Mon Jun 15 09:21:11 PDT 2009,NO_QUERY,VelvetLace,@Michael_Cera I couldn't make it to Alexa Chun...
393179,0,2055335013,Sat Jun 06 09:07:43 PDT 2009,NO_QUERY,Najooj,@OmarKassem ....should i be offended?!?! that ...
426711,0,2063591506,Sun Jun 07 03:17:09 PDT 2009,NO_QUERY,jjjohannaaa,@JessicaViberg I also feel fat .. need to exe...
1409745,4,2056081643,Sat Jun 06 10:29:36 PDT 2009,NO_QUERY,mrdpa,4 weeks off the booze today can't believe I'v...
529553,0,2195510114,Tue Jun 16 11:13:40 PDT 2009,NO_QUERY,colep010,"@pippsqueak sounds lovely, unfortunately i hav..."
812365,4,1548349463,Fri Apr 17 20:42:34 PDT 2009,NO_QUERY,kiss_myy_sasss,oh! I was like when Cobra played 'Smile for t...
1014700,4,1881464730,Fri May 22 04:44:26 PDT 2009,NO_QUERY,j4yloh,Pr Andy just said something really profound: &...
873574,4,1679453416,Sat May 02 09:23:32 PDT 2009,NO_QUERY,natalie_erin_1,Another day to enjoy with my lovely girlfriend...
133929,0,1836128014,Mon May 18 07:25:31 PDT 2009,NO_QUERY,Stuartmcminigal,@lucyrowse why are you so horrible towards me!


In [5]:
#renaming columns
data = data.rename(columns = {0: 'polarity', 1: 'ids', 2: 'date', 3: 'flag', 4: 'user', 5: 'text'})

In [6]:
data.head()
# In the sentiment analysis dataset, 0: negative, 2: neutral, 4: positive 


Unnamed: 0,polarity,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
data.shape

(1600000, 6)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1600000 non-null  int64 
 1   ids       1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   flag      1600000 non-null  object
 4   user      1600000 non-null  object
 5   text      1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


The next steps are to ensure that the texts are readable and analyzable. I will remove any numbers or symbols from the text because they are of no analytical importance.

In [9]:
#https://stackoverflow.com/questions/25292838/applying-regex-to-a-pandas-dataframe
import re

# def string(column):
#     re.sub(r'[^a-zA-Z\s\t]+', "", column)   
#data['text2'] = data['text'].apply(string)
#df['Season2'] = df['Season'].apply(split_it)
# import re
# re.sub(r'[^a-zA-Z\s\t]+', "", list_text)

In [10]:
data['text2'] = data['text'].astype(str)

In [11]:
data.head()

Unnamed: 0,polarity,ids,date,flag,user,text,text2
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","@nationwideclass no, it's not behaving at all...."


In [12]:
#this is to store it as a string
df['text2'] = df['text'].astype('|S')

NameError: name 'df' is not defined

In [None]:
data.info()

In [None]:
#using regular expression
data['text_2'] =  data['text2'].apply(lambda x: re.sub(r'[^a-zA-Z\s\t]+', "", str(x)))

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data['text_2'] = data['text_2'].astype(str)

In [None]:
data.info()

In [None]:
text = data['text_2']

At this point, I am going to explore the corpus of my texts and see which words are the most common. I am going to use a wordcloud generator.

In [None]:
#https://www.datacamp.com/community/tutorials/wordcloud-python
wordcloud = WordCloud().generate(" ".join(text))

In [None]:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

The most common words here are not of interest to my analysis. I will therefore look at tweets that contain some of the most common body shaming related words and generate a word cloud and bar graph out of those tweets. 

In [None]:
#my_vocab = ['ugly body', 'fat body', 'dark skin', 'big body', 'fat ass', 'big boobs', 'hairy', 'hate body', 'heavy body', 'big tummy', 'fat chicks', 'fat girls', 'fat boy', 'fat guys', 'short chicks', 'short guys', 'short man', 'jiggle']

In [None]:
my_vocab = ['ugly', 'fat', 'body', 'dark', 'heavy', 'jiggle', 'short', 'tall', 'skinny']

In [None]:
#data1 = data[data['text_2'].isin(my_vocab)]

In [None]:
#https://stackoverflow.com/questions/28914078/filter-out-rows-based-on-list-of-strings-in-pandas
data['vocab'] = np.where(data.text_2.str.contains('|'.join(my_vocab)),1,0)

In [None]:
data.head()

In [None]:
#filtering for tweets that contain the words in my list. 
data1 = data[data['vocab'] == 1]

In [None]:
data1.head()

In [None]:
text1 = data1['text_2']

Next step would be to remove stop words.

In [None]:
#https://openclassrooms.com/en/courses/6532301-introduction-to-natural-language-processing/6980726-remove-stop-words-from-a-block-of-text

from collections import Counter
# # transform the text into a list of words
# words_list = text1.split(' ')
# define the list of words you want to remove from the text
stopwords = ['the', 'of', 'and', 'is','to','in','a','from','by','that', 'with', 'this', 'as', 'an', 'are','its', 'at', 'for']
# use a python list comprehension to remove the stopwords from words_list
words_without_stopwords = [ word for word in text1 if word not in stopwords ]


In [None]:
wordcloud1 = WordCloud().generate(" ".join(words_without_stopwords))

In [None]:
plt.imshow(wordcloud1, interpolation='bilinear')
plt.axis("off")
plt.show()

It is interesting that 'dark', 'body', 'short', 'ugly' are some of the most common words. 

Next, I want to create a bar graph showing the most common words.

In [None]:
#https://www.earthdatascience.org/courses/use-data-open-source-python/intro-to-apis/calculate-tweet-word-frequencies-in-python/
words_split = [tweet.lower().split() for tweet in words_without_stopwords]


In [None]:
import itertools
# List of all words across tweets
all_words = list(itertools.chain(*words_split))

In [None]:
import collections
from collections import Counter
word_count = collections.Counter(all_words)

word_count.most_common(15)

The stopwords that I use earlier did not quite do the trick. I am going to use the stopwords from nltk.

In [None]:
nltk.download('stopwords')

In [None]:
# #https://stackoverflow.com/questions/63018726/counter-and-plot-the-most-common-word-in-a-text
# tf = Counter(words_without_stopwords)

In [None]:
#import tweepy as tw
import nltk
from nltk.corpus import stopwords
import re
import networkx

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
tweets_nsw = [[word for word in tweet_words if not word in stop_words]
              for tweet_words in words_split]

In [None]:
all_words_nsw = list(itertools.chain(*tweets_nsw))

counts_nsw = collections.Counter(all_words_nsw)

counts_nsw.most_common(20)

We see that some of the words that can be used to body shame, such as 'short', 'body', 'fat', 'dark' is among the top 20 most common words.

In [None]:
clean_tweets_nsw = pd.DataFrame(counts_nsw.most_common(20),
                             columns=['words', 'count'])

fig, ax = plt.subplots(figsize=(8, 8))

# Plot horizontal bar graph
clean_tweets_nsw.sort_values(by='count').plot.barh(x='words',
                      y='count',
                      ax=ax,
                      color="purple")

ax.set_title("Common Words Found in Tweets related to body(Without Stop Words)")

plt.show()

In [None]:
small_data[0:5]

In [None]:
# !pip install punkt
# !pip install wordnet



In [None]:
#word cloud
#bar graph
#sentiment analysis: most common words 