In [1]:
#Relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split



In [2]:
# Read the cleaned-tweets CSV into the working DataFrame `df`.

df = pd.read_csv(filepath_or_buffer='../data/tweets_cleaned_eda.csv')

In [3]:
# Show the first five rows as a quick check of the loaded columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,cleaned_tweets,tweet_length
0,0,"In other words #katandandre, your food was cra...",not_cyberbullying,word katandandre food crapilicious mkr,61
1,1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,aussietv white mkr theblock imacelebrityau tod...,115
2,2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,classy whore red velvet cupcake,60
3,3,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,isi account pretend kurdish account . like isl...,103
4,4,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying,yes test god good bad indifferent weird whatev...,131


In [4]:
# List the distinct category labels present in the target column.
df.cyberbullying_type.unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [5]:
# Encode each cyberbullying category as an integer class label.
#
# The column is reassigned explicitly instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates an object selected
# from the frame, which pandas warns about and which no longer updates `df`
# once Copy-on-Write is enabled.
label_codes = {
    'not_cyberbullying': 0,
    'gender': 1,
    'religion': 2,
    'age': 3,
    'ethnicity': 4,
    'other_cyberbullying': 5,
}

# Values that are not keys of label_codes (for example the already-encoded
# integers when this cell is run a second time) pass through unchanged, which
# is what replace() did as well.
df['cyberbullying_type'] = df['cyberbullying_type'].map(
    lambda label: label_codes.get(label, label)
)

df.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,cleaned_tweets,tweet_length
0,0,"In other words #katandandre, your food was cra...",0,word katandandre food crapilicious mkr,61
1,1,Why is #aussietv so white? #MKR #theblock #ImA...,0,aussietv white mkr theblock imacelebrityau tod...,115
2,2,@XochitlSuckkks a classy whore? Or more red ve...,0,classy whore red velvet cupcake,60
3,3,@RudhoeEnglish This is an ISIS account pretend...,0,isi account pretend kurdish account . like isl...,103
4,4,"@Raja5aab @Quickieleaks Yes, the test of god i...",0,yes test god good bad indifferent weird whatev...,131


In [6]:
# Inspect the rows whose cleaned tweet text is missing (nothing is removed here).
df.loc[df['cleaned_tweets'].isnull()]

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,cleaned_tweets,tweet_length
377,377,@kylieminogue @muse,0,,19
1102,1102,@CanadianSpider how are you?,0,,28
1136,1136,@TakeAStand2Day,0,,15
2134,2134,@itvnews @mittromney @barackobama @robertmooreitv,0,,49
2173,2173,That is all,0,,11
3147,3147,@jarihimanen How so?,0,,20
3674,3674,Why?,0,,4
4600,4600,@TheyLOVEJayP or what,0,,21
4629,4629,@g56yu Who?,0,,12
5288,5288,Just NOW?!?!? 😄😃😀,0,,18


In [7]:
# Drop every row that has a missing value in any column, then re-run the
# missing-text filter; an empty frame confirms no such rows remain.
df = df.dropna(axis=0, how='any')
df.loc[df['cleaned_tweets'].isna()]

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,cleaned_tweets,tweet_length


In [8]:
# Encode the cleaned tweet text as TF-IDF features, capping the vocabulary at
# 100 terms and filtering scikit-learn's built-in English stop-word list.
# NOTE(review): the vectorizer is fitted on every row of df, before the
# train/test split performed later in this notebook, so the vocabulary and IDF
# weights are also derived from rows that end up in the test set.
tv = TfidfVectorizer(max_features=100, stop_words='english')
tweets_transformed = tv.fit_transform(df['cleaned_tweets'])
# Printing the sparse result lists its stored (row, column) value entries.
print(tweets_transformed)

  (0, 48)	0.629545521773713
  (0, 95)	0.776963600186433
  (1, 93)	0.715095195471068
  (1, 48)	0.699027082032016
  (3, 31)	0.6108032736491378
  (3, 39)	0.4077573526760115
  (3, 30)	0.678714448231469
  (4, 1)	0.347089083828579
  (4, 22)	0.3308809904720595
  (4, 21)	0.7827133225407708
  (4, 99)	0.39674524918195775
  (5, 7)	1.0
  (6, 48)	1.0
  (7, 70)	0.5664015264135237
  (7, 5)	0.8241294260469227
  (8, 27)	1.0
  (9, 27)	0.8096616982416556
  (9, 7)	0.586896868623814
  (10, 48)	1.0
  (11, 98)	0.5371600380588135
  (11, 8)	0.6273811827277398
  (11, 35)	0.5637835977321408
  (12, 42)	0.7592166780059856
  (12, 48)	0.6508379489838891
  (13, 7)	1.0
  :	:
  (44314, 52)	0.5045059559070055
  (44314, 13)	0.5129103505068486
  (44314, 17)	0.4783467649244328
  (44314, 39)	0.5035683521465877
  (44315, 56)	0.61137874139663
  (44315, 96)	0.6292377367897841
  (44315, 5)	0.4798707171394634
  (44316, 51)	0.583579834285283
  (44316, 94)	0.4171791496537471
  (44316, 46)	0.5340809636332885
  (44316, 93)	0.4473853

In [9]:
# Build a dense DataFrame of the TF-IDF matrix, one column per vocabulary term.
# The frame is joined with the labels in a later cell, not here.
#
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on versions that predate it.
# The 'Counts_' prefix is kept because later cells and the saved CSV use these
# column names, even though the values are TF-IDF weights rather than counts.
tweets_transformed_df = pd.DataFrame(
    tweets_transformed.toarray(),
    columns=(
        tv.get_feature_names_out()
        if hasattr(tv, 'get_feature_names_out')
        else tv.get_feature_names()
    ),
).add_prefix('Counts_')




In [22]:
# Place the TF-IDF features and the label column side by side. Both indexes
# are reset to 0..n-1 first so the rows line up by position (df has gaps in
# its index after the NaN rows were dropped).
df_transformed = pd.concat(
    [
        tweets_transformed_df.reset_index(drop=True),
        df['cyberbullying_type'].reset_index(drop=True),
    ],
    axis=1,
    sort=False,
)
df_transformed.shape

(44320, 101)

In [23]:
# Hold out 30% of the rows for testing. The split is stratified on the label
# so class proportions match in both parts, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df_transformed.drop(columns='cyberbullying_type'),
    df_transformed['cyberbullying_type'],
    test_size=0.3,
    random_state=47,
    stratify=df_transformed['cyberbullying_type'],
)

In [24]:
# Report the shapes of the four split parts on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(31024, 100) (13296, 100) (31024,) (13296,)


In [25]:
# Show the first five rows of the combined feature/label frame.
df_transformed.head(n=5)

Unnamed: 0,Counts_ask,Counts_bad,Counts_believe,Counts_big,Counts_bitch,Counts_black,Counts_boy,Counts_bully,Counts_child,Counts_christian,...,Counts_watch,Counts_way,Counts_white,Counts_woman,Counts_word,Counts_work,Counts_world,Counts_year,Counts_yes,cyberbullying_type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.776964,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.715095,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.347089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.396745,0


In [26]:
# Persist the combined feature/label frame as CSV.
# NOTE(review): this writes all of df_transformed (the frame that was split
# into train and test above), not only the training part, even though the file
# is named train_tweets.csv -- confirm that is intended.
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed leading column (the CSV loaded at the top of this notebook already
# carries such an 'Unnamed: 0' column); confirm downstream readers expect it.
df_transformed.to_csv('../data/train_tweets.csv')