# Data Cleaning


In [65]:
!pip install contractions



In [66]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [67]:
import pandas as pd
import numpy as np
import re
import contractions

import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [68]:
cols = ["label","id", "date", "query", "user", "text"]
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Twitter Sentiment Classification/training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1", names = cols)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   label   1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   query   1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [69]:
df.head()

Unnamed: 0,label,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [70]:
df.label.value_counts()

label
0    800000
4    800000
Name: count, dtype: int64

In [71]:
df.drop(labels = ["id", "date", "query", "user"], axis = 1, inplace = True)

In [72]:
df.head()

Unnamed: 0,label,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


### Negative Labels

In [73]:
df[df.label == 0].head(10)

Unnamed: 0,label,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
5,0,@Kwesidei not the whole crew
6,0,Need a hug
7,0,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,@Tatiana_K nope they didn't have it
9,0,@twittera que me muera ?


### Positive Labels

In [74]:
df[df.label == 4].head(10)

Unnamed: 0,label,text
800000,4,I LOVE @Health4UandPets u guys r the best!!
800001,4,im meeting up with one of my besties tonight! ...
800002,4,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,4,Being sick can be really cheap when it hurts t...
800004,4,@LovesBrooklyn2 he has that effect on everyone
800005,4,@ProductOfFear You can tell him that I just bu...
800006,4,@r_keith_hill Thans for your response. Ihad al...
800007,4,"@KeepinUpWKris I am so jealous, hope you had a..."
800008,4,"@tommcfly ah, congrats mr fletcher for finally..."
800009,4,@e4VoIP I RESPONDED Stupid cat is helping me ...


### 1. Expanding Contractions
- This is done using `contractions` library.

In [75]:
def expand_contractions(text):
  return contractions.fix(text)

In [76]:
expand_contractions("I dont know where he'll go. Maybe he's going to Lindy's place?")

"I do not know where he will go. Maybe he is going to Lindy's place?"

### 2.Removing noise texts (stopwords, emails, hashtags, username mentions, links)

In [77]:
def remove_noise_texts(text):
  # Patterns of the expressions we want to remove
  # match sequence of non-whitespace characters (\S+) followed by "@" and another sequence of non-whitespace characters (\S+).
  email_re = r'\S+@\S+'
  # match any occurrence of "@" of "#" symbol followed by any sequence of non-whitespace characters (\S+)
  mention_hashtag_re = r'(@|#)\S+'
  link_re = r'((www\.\S+)|(https?://\S+))|(\S+\.com)'
  # matche same character as the first part (.)\1 repeated two or more times.
  repeating_re = r'(.)\1\1+'

  # Remove patterns in our textual data
  text = re.sub(repeating_re, r'\1', text)
  text = re.sub(email_re, '', text)
  text = re.sub(mention_hashtag_re, '', text)

  return text



In [78]:
remove_noise_texts("In twitter.com a setence can have an email email_123@abcd.nlp.com, it also mentions @someone_on_Twitter and has words with repeating last letter to express my feelingssssssssssss. Lastly #DontForgetTheHashTag")

'In twitter.com a setence can have an email  it also mentions  and has words with repeating last letter to express my feelings. Lastly '

### Stopwords removal

In [79]:
# Load nltk stopwords list.
# This should be loaded outside so that it does not get re-loaded every time we call the function.
stop_words = stopwords.words("english")

def remove_stopwords(text: str, neg_keep: bool = False):
  stop_word_list = [word for word in stop_words if word not in ['not', 'no', 'nor']] if neg_keep else stop_words

  # Filter out the words that are not in the list
  text = ' '.join([word for word in text.split() if word not in stop_word_list])
  return text


In [80]:
remove_stopwords("I do not like the food")

'I like food'

In [81]:
remove_stopwords("I do not like the food", neg_keep = True)

'I not like food'

### Removing non-alphabets (digits, special characters, punctuation marks)


In [82]:
def remove_non_alpha(text:str):
  # Pattern of the expression we want to remove
  non_alpha_digit_re = r'[^a-zA-Z]'

  # Remove patterns in out textual data
  text = re.sub(non_alpha_digit_re, r' ', text)
  return text

In [83]:
remove_non_alpha('I am 21 years old!!! This is the beginning- of a new chapter')

'I am    years old    This is the beginning  of a new chapter'

### Removing redundant spaces

In [84]:
def remove_spaces(text: str):
  multi_spaces_re = r'\s\s+'
  text = re.sub(multi_spaces_re, ' ', text)
  return text

In [85]:
remove_spaces("I am  removing      spaces.")

'I am removing spaces.'

### Removing short words
Removing short words help to reduce the vocabulary size

In [86]:
def remove_short_words(text: str):
  text = ' '.join(word for word in text.split() if len(word) > 1)
  return text

### Defining the data cleaning function

In [87]:
def data_cleaning(text: str, remove_all_stopword=False):
  text = text.lower()
  text = expand_contractions(text)
  text = remove_noise_texts(text)
  text = remove_non_alpha(text)
  text = remove_short_words(text)
  text = remove_spaces(text)
  if remove_all_stopword:
    text = remove_stopwords(text, neg_keep = True)

  return text.strip()


In [64]:
data_cleaning("Just got an email from prankster@troll.com, I think it's @Jordan's troll. He told me in the mail that he really don't like my dog, but he did like a cat. #IHateJordan", remove_all_stopword = True)

'got email think troll told mail really not like dog like cat'

In [88]:
df["cleaned_text"] = df["text"].apply(lambda row: data_cleaning(row, remove_all_stopword = True))

In [89]:
df.head()

Unnamed: 0,label,text,cleaned_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",http twitpic com zl aw bummer shoulda got davi...
1,0,is upset that he can't update his Facebook by ...,upset cannot update facebook texting might cry...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",no not behaving mad cannot see


In [99]:
# Remove rows with no characters
df_cleaned = df[df['cleaned_text'] != ''][['cleaned_text', 'label']]

In [100]:
df_cleaned.head()

Unnamed: 0,cleaned_text,label
0,http twitpic com zl aw bummer shoulda got davi...,0
1,upset cannot update facebook texting might cry...,0
2,dived many times ball managed save rest go bounds,0
3,whole body feels itchy like fire,0
4,no not behaving mad cannot see,0


### Saving the data
We can finally transform our data into a feature vector **X**, and a target vector **y**. Thet will be saved into a folder as numpy arrays.

In [104]:
X = df_cleaned["cleaned_text"].values
y = df_cleaned["label"].values

In [107]:
np.save("/content/drive/MyDrive/Colab Notebooks/Twitter Sentiment Classification/preprocessed_data/feature_vectors.npy", X)
np.save("/content/drive/MyDrive/Colab Notebooks/Twitter Sentiment Classification/preprocessed_data/target_vectors.npy", y)