In [None]:
# Loading Data
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.model_selection import train_test_split
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
import nltk
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import metrics
import seaborn as sn
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('Project1-Classification.csv')

# Seed the global random generators (NumPy and the stdlib `random` module)
np.random.seed(42)
random.seed(42)

# Split data into training (80%) and testing (20%) data.
# The seed is passed explicitly so the partition does not depend on how much
# of NumPy's global random state has been consumed before this line runs.
train, test = train_test_split(
    df[["full_text", "root_label"]], test_size=0.2, random_state=42
)

# Clean Data
def clean(text):
    """Strip URLs, HTML remnants and noisy characters from one document.

    Steps, in order:
      * drop every line that *starts* with ``http://`` or ``https://``
        (URLs in the middle of a line are left alone);
      * turn ``<br />`` into a space and decode ``&quot;``, ``&#39;`` and
        ``&amp;`` (the latter becomes the word "and");
      * replace newlines / carriage returns with spaces, expand the
        standalone token " u " to " you ", drop backticks;
      * collapse runs of spaces and runs of ``!`` or ``?``;
      * drop all non-ASCII characters and any remaining ``<...>`` tags.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The cleaned ``str`` (possibly empty).
    """
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"", texter)
    # &#39; is the HTML entity for an apostrophe, not a double quote.
    texter = re.sub('&#39;', "'", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u ', " you ", texter)
    texter = re.sub('`', "", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ', texter)
    # Named so it no longer shadows this function's own name.
    tag_pattern = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(tag_pattern, '', texter)
    return texter

# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)

def clean_text(text, table):
    """Clean every document in a pandas Series and return the result.

    Each element is passed through ``clean``, has the characters listed in
    ``table`` removed, loses every whitespace-delimited token that contains a
    digit, and is lower-cased.

    The input Series is not modified; a new Series with the same index and
    name is returned. This avoids element-wise assignment into a Series that
    may be a slice of another DataFrame, and it also works when the index
    contains duplicate labels.

    Args:
        text: pandas Series of ``str`` documents.
        table: translation table as accepted by ``str.translate``.

    Returns:
        A new pandas Series holding the cleaned documents.
    """
    def _normalize(raw):
        cleaned = clean(raw).translate(table)
        # Drop any token containing a digit (e.g. "2021", "covid19").
        cleaned = re.sub(r'\S*\d\S*', '', cleaned)
        return cleaned.lower()

    return text.map(_normalize)

# Replace the raw text column of both splits with its cleaned version.
train["full_text"] = clean_text(train["full_text"], table)
test["full_text"] = clean_text(test["full_text"], table)

In [None]:
# Feature Extraction

# Shared WordNet lemmatizer instance; lemmatize_text calls wnl.lemmatize on each token.
wnl = WordNetLemmatizer()

def get_pos(tag):
    """Return the one-letter POS code for a ``(token, pos_tag)`` pair.

    Only the first two characters of the pair's second element are
    inspected: 'JJ' -> 'a', 'NN' -> 'n', 'RB' -> 'r', 'VB' -> 'v'.
    Any other prefix falls back to 'n'.
    """
    prefix_to_code = {'JJ': 'a', 'NN': 'n', 'RB': 'r', 'VB': 'v'}
    return prefix_to_code.get(tag[1][:2], 'n')
    
def lemmatize_text(text):
    """Tokenize ``text``, POS-tag the tokens and return the list of lemmas.

    Each token is lemmatized with ``wnl`` using the POS code that
    ``get_pos`` derives from its tag.
    """
    lemmas = []
    for tagged in pos_tag(nltk.word_tokenize(text)):
        lemmas.append(wnl.lemmatize(tagged[0], get_pos(tagged)))
    return lemmas

# Shared Porter stemmer instance; stem_text calls ps.stem on each token.
ps = nltk.stem.PorterStemmer()

def stem_text(text):
    """Tokenize ``text`` and return the list of stems produced by ``ps``."""
    return list(map(ps.stem, nltk.word_tokenize(text)))

# Transform categories into numbers (original note: Climate=0, Sports=1).
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer mapping. A test
# label never seen in training raises a ValueError instead of being silently
# assigned its own number.
Encoder = LabelEncoder()
train['binary_root_label'] = Encoder.fit_transform(train['root_label'])
test['binary_root_label'] = Encoder.transform(test['root_label'])

# Candidate dimensionality reducers for the grid search:
# NMF and truncated SVD, each with 5, 30 and 80 components.
nmf_5, nmf_30, nmf_80 = (
    NMF(n_components=k, init='random', random_state=42) for k in (5, 30, 80)
)
svd_5, svd_30, svd_80 = (
    TruncatedSVD(n_components=k, random_state=42) for k in (5, 30, 80)
)

# Candidate classifiers for the grid search
l1_logregression = LogisticRegression(
    penalty='l1', C=100, solver='saga', max_iter=100000, random_state=42
)
l2_logregression = LogisticRegression(
    penalty='l2', C=1000, solver='saga', max_iter=100000, random_state=42
)
best_svm = svm.SVC(C=100000, kernel='linear', random_state=42)
gnb = GaussianNB()

In [None]:
from sklearn.pipeline import Pipeline

# Text-classification pipeline: token counts -> TF-IDF weighting ->
# dimensionality reduction -> classifier. The 'reduce_dim' and 'clf' steps
# given here are only defaults; the grid below substitutes each candidate.
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', NMF(n_components=5, init='random', random_state=42)), # dim reduction
    ('clf', svm.SVC(kernel='linear', C=1))])

# The lemmatizer / stemmer are supplied as the vectorizer's *tokenizer*, not
# its *analyzer*: CountVectorizer applies stop_words only when
# analyzer == 'word', so a callable analyzer would silently skip the English
# stop-word removal configured on the 'vect' step.
param_grid = {'vect__min_df': (3, 5), # set min_df
              'vect__tokenizer': (lemmatize_text, stem_text),  # lemmatization vs. stemming
              # The default token regex is unused once a tokenizer is given;
              # None avoids scikit-learn's "token_pattern will not be used" warning.
              'vect__token_pattern': (None,),
              'reduce_dim': (nmf_5, nmf_30, nmf_80, svd_5, svd_30, svd_80), # reduce dimensionality
              'clf': (l1_logregression, l2_logregression, best_svm, gnb)}

In [None]:
# 5-fold cross-validated grid search over param_grid, with n_jobs=-1.
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
search.fit(train["full_text"], train["binary_root_label"])