### Tokenization and Removing Stopwords

In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [2]:
# Load the scraped Reddit posts.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists. Consider a path relative to the repository.
rawData = pd.read_csv("/Users/skhiearth/Desktop/Reddit-Flair-Detection/RedditData/Data/raw.csv")

# Replace missing values in the three text-like columns with empty strings so
# that the string concatenation below never produces NaN.
# A single DataFrame-level fillna with a per-column dict is used instead of
# `rawData[col].fillna("", inplace=True)`: the latter mutates a temporary
# Series, which emits a FutureWarning on pandas >= 2.1 and does not write back
# to `rawData` at all when copy-on-write is enabled.
rawData = rawData.fillna({"flair": "", "body": "", "title": ""})

# Single text feature per post: title, one space, then body.
rawData["text"] = rawData["title"] + " " + rawData["body"]
rawData.head(2)

Unnamed: 0,title,id,url,body,flair,text
0,Will donate thrice the number of upvotes (amou...,981o7s,https://www.reddit.com/r/india/comments/981o7s...,>**Note**: If you want to know what this is al...,[R]eddiquette,Will donate thrice the number of upvotes (amou...
1,Indian reply to NYtimes cartoon on Paris clima...,6f10op,http://imgur.com/a/U48v9,,/r/all,Indian reply to NYtimes cartoon on Paris clima...


In [3]:
# Literal fragments to strip from every flair label (e.g. the "/r/" prefix in
# "/r/all" and the "[R]" tag in "[R]eddiquette").
bad_chars = ['/r', 'r/', '[R]']
for fragment in bad_chars:
    # regex=False forces a literal substring match. Without it the result
    # depends on the pandas version: before pandas 2.0 the default was
    # regex=True, under which '[R]' is a character class that deletes every
    # capital "R" in a flair rather than only the literal "[R]" tag.
    rawData['flair'] = rawData['flair'].str.replace(fragment, '', regex=False)

In [4]:
def remove_punctuation(text):
    """Return `text` with every character listed in `string.punctuation` removed.

    Characters are examined one at a time and all others (letters, digits,
    whitespace, non-ASCII symbols) are kept in their original order.
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Strip punctuation from the model input text ...
rawData['text'] = rawData['text'].apply(remove_punctuation)
# ... and from the flair labels, element by element.
rawData['flair'] = rawData['flair'].apply(remove_punctuation)

# Preview the first two cleaned rows.
rawData.head(2)

Unnamed: 0,title,id,url,body,flair,text
0,Will donate thrice the number of upvotes (amou...,981o7s,https://www.reddit.com/r/india/comments/981o7s...,>**Note**: If you want to know what this is al...,eddiquette,Will donate thrice the number of upvotes amoun...
1,Indian reply to NYtimes cartoon on Paris clima...,6f10op,http://imgur.com/a/U48v9,,all,Indian reply to NYtimes cartoon on Paris clima...


In [5]:
# NOTE: everything in this cell is commented out, so it has no effect on
# `rawData`. It is a disabled manual pipeline (lower-casing + regex
# tokenization, then NLTK English stop-word removal). Stop-word filtering is
# instead requested via TfidfVectorizer(stop_words='english') in the
# vectorization step later in this notebook.

# tokenizer = RegexpTokenizer(r'\w+')
# rawData['text'] = rawData['text'].apply(lambda x: tokenizer.tokenize(x.lower()))
# rawData['flair'] = rawData['flair'].apply(lambda x: tokenizer.tokenize(x.lower()))

# def remove_stopwords(text):
#     words = [w for w in text if w not in stopwords.words('english')]
#     return words

# rawData['text'] = rawData['text'].apply(lambda x:remove_stopwords(x))
# rawData['flair'] = rawData['flair'].apply(lambda x:remove_stopwords(x))

### Train-Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    rawData['text'],
    rawData['flair'],
    random_state=100,
    test_size=0.2,
)

### Vectorization

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _feature_names(vectorizer):
    """Return the vocabulary terms of a fitted vectorizer, in column order.

    scikit-learn 1.0 added `get_feature_names_out()` and 1.2 removed the older
    `get_feature_names()`. Use whichever one the installed version provides so
    the cell runs on both old and new releases.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Learn the vocabulary and IDF weights from the training split only, then
# apply that same fitted vectorizer to both splits so the test split does not
# influence the features.
vect = TfidfVectorizer(stop_words='english').fit(X_train)

tfidf_train = vect.transform(X_train)
tfidf_test = vect.transform(X_test)

# Densify the sparse TF-IDF matrices into DataFrames with one column per term.
# toarray() returns a plain ndarray, whereas todense() returns the discouraged
# np.matrix type.
# NOTE: these frames get a fresh 0..n-1 row index, while y_train / y_test keep
# the row labels of `rawData`; pair features with labels by position.
X_train_vect = pd.DataFrame(tfidf_train.toarray(), columns=_feature_names(vect))
X_test_vect = pd.DataFrame(tfidf_test.toarray(), columns=_feature_names(vect))

# Used for fitting on the whole dataset later
total_vect = TfidfVectorizer(stop_words='english').fit(rawData['text'])
tfidf = total_vect.transform(rawData['text'])
rawData_vect = pd.DataFrame(tfidf.toarray(), columns=_feature_names(total_vect))

Unnamed: 0,005,02,02mbps,03012020,0315am,07,10,100,1000,10000,...,zero,zindabad,zoanthids,zoe,zojila,zomato,zomatos,zoo,zoya,µgkg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Sanity check: each feature matrix must have as many rows as its label vector.
print("X_train_vect has {} rows and {} columns and y_train also has {} rows."
      .format(X_train_vect.shape[0], X_train_vect.shape[1], y_train.shape[0]))

# The label count reported here comes from y_test, so the message names y_test
# (it previously said "y_train" while printing y_test's row count).
print("X_test_vect has {} rows and {} columns and y_test also has {} rows."
      .format(X_test_vect.shape[0], X_test_vect.shape[1], y_test.shape[0]))

X_train_vect has 790 rows and 5737 columns and y_train also has 790 rows.
X_test_vect has 198 rows and 5737 columns and y_train also has 198 rows.
