**Name:** SIVAKUMAR Srinivas

**EID:** ssivakuma2

**Kaggle Team Name:** Shaata

# CS4487 - Assignment 1 - YouTube Spam Classifier

## Final submission
In this file, put the code that generates your final Kaggle submission. It will be used to verify that your Kaggle submission is reproducible.

In [1]:
%matplotlib inline
import IPython.core.display         
# setup output image format (Chrome works best)
IPython.core.display.set_matplotlib_formats("svg")
import matplotlib.pyplot as plt
import matplotlib
from numpy import *
from sklearn import *
from scipy import stats
random.seed(100)
import csv
import pandas as pd



In [2]:
# function to write a CSV file for Kaggle submission
def write_csv_kaggle_sub(fname, Y):
    # fname = file name
    # Y is a list/array with class entries
    with open(fname, 'w') as f:
        writer = csv.writer(f)

        # header
        tmp = ['Id', 'Prediction']            
        writer.writerow(tmp)
    
        # add ID numbers for each Y, and usage if necessary
        for (i,y) in enumerate(Y):
            tmp2 = [(i+1), y]
            writer.writerow(tmp2)
        
        f.close()

In [3]:
# load the data with pandas
traindf = pd.read_csv('youtube_spam_train.csv')
testdf  = pd.read_csv('youtube_spam_test.csv')

# extract data into a dictionary

train = {}                                 ### training data
train['txt']   = list(traindf['TEXT'])     # comment text
train['class'] = list(traindf['CLASS'])    # class (0=not spam; 1=spam)
train['author'] = list(traindf['AUTHOR'])  # author name
train['video']  = list(traindf['VIDEO'])   # video ID
train['date']   = list(traindf['DATE'])    # date of the comment

test = {}                                  ### testing data
test['txt']   = list(testdf['TEXT'])
test['author'] = list(testdf['AUTHOR'])
test['video']  = list(testdf['VIDEO'])
test['date']    = list(testdf['DATE'])

print(len(train['txt']))
print(len(test['txt']))

1000
956


In [4]:
import nltk
import re
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def replace_url_in_doc(doc):
    doc = re.sub('(/^(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?$/)', 'URL', doc)
    return re.sub('(/^(www\.[^\s]+)|(https?://[^\s]+))','URL', doc) 

for i in range(len(traindf['TEXT'])):
    text = traindf['TEXT'][i]
    text = replace_url_in_doc(text)
    words = text.split()
    text = ''
    for j in words:
        if(text == ''):
            text += wordnet_lemmatizer.lemmatize(j.lower(),pos='v')
        else:
            text = text +' '+wordnet_lemmatizer.lemmatize(j.lower(),pos='v')
    traindf['TEXT'][i] = text 
    
for i in range(len(testdf['TEXT'])):
    text = testdf['TEXT'][i]
    text = replace_url_in_doc(text)
    words = text.split()
    text = ''
    for j in words:
        if(text == ''):
            text += wordnet_lemmatizer.lemmatize(j.lower(),pos='v')
        else:
            text = text +' '+wordnet_lemmatizer.lemmatize(j.lower(),pos='v')
    testdf['TEXT'][i] = text

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/elft18/ssivakuma2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
from sklearn.base import TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
import string
import numpy as np
class ColumnExtractor(BaseEstimator,TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xcols = X[self.cols]
        return Xcols
    
class ConvertToList(BaseEstimator,TransformerMixin):
    
    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xlist = X.values.tolist()
        return Xlist

class LengthTransformer(BaseEstimator,TransformerMixin):
    def fit(self, X, y=None):
        # stateless transformer
        return self
    def transform(self,X):
        #X is a list
        array = []
        for t in X:
            array.append(len(t))
        return np.array(array).reshape(-1,1)

class AuthorTransformer(BaseEstimator,TransformerMixin):
    def fit(self, X, y=None):
        # stateless transformer
        return self
    def transform(self,X):
        #X is a pandas df with two fields, 'TEXT' and 'AUTHOR'
        array = []
        AuthorList = list(X['AUTHOR'])
        CommentList = list(X['TEXT'])
        for i in range(len(AuthorList)):
            flag = True
            words = AuthorList[i].split()
            for word in words:
                if word in CommentList[i]:
                    flag = False
                    array.append(True)
                    break
            if flag:
                array.append(False)
        return np.array(array).reshape(-1,1)
class PunctuationTransformer(BaseEstimator,TransformerMixin):
    def fit(self, X, y=None):
        # stateless transformer
        return self
    def transform(self,X):
        count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))
        array = []
        for t in X:
            array.append(count(t, string.punctuation))
        return np.array(array).reshape(-1,1)
        

In [6]:
from sklearn.pipeline import *
from sklearn.preprocessing import *
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action='ignore', category=DataConversionWarning)
logregPipeline = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('extract',ColumnExtractor('TEXT')),
            ('convert',ConvertToList()),
            ('vectorizer', feature_extraction.text.CountVectorizer(stop_words='english', max_features=1400))
        ])),
        ('essay_length', Pipeline([
            ('extract',ColumnExtractor('TEXT')),
            ('convert',ConvertToList()),
            ('length',LengthTransformer()),
            ('scaler',StandardScaler())
        ])),
        ('author_name',  Pipeline([
            ('extract',ColumnExtractor(['TEXT','AUTHOR'])),
            ('author',AuthorTransformer())
        ])),
        ('punctuations',  Pipeline([
            ('extract',ColumnExtractor(['TEXT'])),
            ('convert',ConvertToList()),
            ('punctuations',PunctuationTransformer()),
            ('scaler',StandardScaler())
        ]))
    ])),
    ('class',linear_model.LogisticRegression(C=100))
])
paramgrid = {'class__C': logspace(-4,4,20),'features__text__vectorizer__max_features':linspace(500,5000,15,dtype = int),\
        'features__text__vectorizer__ngram_range':((1, 4),(1,2),(1,3),(1,1))}
logreg = model_selection.GridSearchCV(logregPipeline, paramgrid, cv=5,
                                      n_jobs = -1, verbose=1)
logreg.fit(traindf, traindf['CLASS'])


Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


[Parallel(n_jobs=-1)]: Done 169 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 1368 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 2268 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 3368 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 4668 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 6000 out of 6000 | elapsed:   38.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(cols='TEXT')), ('convert', ConvertToList()), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class__C': array([1.00000e-04, 2.63665e-04, 6.95193e-04, 1.83298e-03, 4.83293e-03,
       1.27427e-02, 3.35982e-02, 8.85867e-02, 2.33572e-01, 6.15848e-01,
       1.62378e+00, 4.28133e+00, 1.12884e+01, 2.97635e+01, 7.84760e+01,
       2.06914e+02, 5.45559e+02, 1.43845e+03, 3.79269e+03, 1...35, 4357, 4678, 5000]), 'features__text__vectorizer__ngram_range': ((1, 4), (1, 2), (1, 3), (1, 1))},
       pre_dispatch='2*n_jobs', refit=True, return_tr

In [7]:
print(logreg.best_score_)
print(logreg.best_params_)

0.944
{'class__C': 11.288378916846883, 'features__text__vectorizer__max_features': 500, 'features__text__vectorizer__ngram_range': (1, 4)}


In [8]:
predY = logreg.predict(testdf)
write_csv_kaggle_sub("final_submission.csv", predY)