In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data files

In [2]:
traindf = pd.read_csv('data/train.csv')
testdf = pd.read_csv('data/test.csv')

### Check original data

In [3]:
traindf.shape, testdf.shape

((31962, 3), (17197, 2))

#### Drop Duplicates

In [4]:
traindf.drop_duplicates(inplace = True)

In [5]:
testdf.drop_duplicates(inplace = True)

In [6]:
traindf.shape, testdf.shape

((31962, 3), (17197, 2))

In [7]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 998.8+ KB


In [8]:
traindf['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [9]:
traindf.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


In [10]:
print(len(traindf))

31962


In [11]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 403.1+ KB


In [12]:
testdf.head(3)

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...


### Cleaning tweets

In [13]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
#create empty list
corpus = []
# loop thru our train dataset 
for i in range (0, len(traindf)):
    tweet = traindf['tweet'][i]
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus.append(tweet)


In [15]:
len(corpus)

31962

In [16]:
print(corpus)

['father dysfunct selfish drag kid dysfunct run', 'thank lyft credit use caus offer wheelchair van pdx disapoint getthank', 'bihday majesti', 'model love u take u time ur', 'factsguid societi motiv', 'huge fan fare big talk leav chao pay disput get allshowandnogo', 'camp tomorrow danni', 'next school year year exam think school exam hate imagin actorslif revolutionschool girl', 'love land allin cav champion cleveland clevelandcavali', 'welcom gr', 'ireland consum price index mom climb previou may blog silver gold forex', 'selfish orlando standwithorlando pulseshoot orlandoshoot biggerproblem selfish heabreak valu love', 'get see daddi today day gettingf', 'cnn call michigan middl school build wall chant tcot', 'comment australia opkillingbay seashepherd helpcovedolphin thecov helpcovedolphin', 'ouch junior angri got junior yugyoem omg', 'thank paner thank posit', 'retweet agre', 'friday smile around via ig cooki make peopl', 'know essenti oil made chemic', 'euro peopl blame ha conced g

In [17]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.2+ MB


In [18]:
traindf['cleaned'] = np.array(corpus)

In [19]:
traindf.head(5)

Unnamed: 0,id,label,tweet,cleaned
0,1,0,@user when a father is dysfunctional and is s...,father dysfunct selfish drag kid dysfunct run
1,2,0,@user @user thanks for #lyft credit i can't us...,thank lyft credit use caus offer wheelchair va...
2,3,0,bihday your majesty,bihday majesti
3,4,0,#model i love u take with u all the time in ...,model love u take u time ur
4,5,0,factsguide: society now #motivation,factsguid societi motiv


In [20]:
train = traindf.drop(columns=['id', 'tweet'])
train.head()

Unnamed: 0,label,cleaned
0,0,father dysfunct selfish drag kid dysfunct run
1,0,thank lyft credit use caus offer wheelchair va...
2,0,bihday majesti
3,0,model love u take u time ur
4,0,factsguid societi motiv
