In [2]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
import textdistance as td
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
import gensim



# Load data from csv

In [384]:
train_csv = "./Data/clean_reddit_02_01_18.csv"
train = pd.read_csv(train_csv, encoding="utf-8")
train.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,author,author_flair_css_class,author_flair_text,body,can_gild,controversiality,created_utc,distinguished,edited,gilded,is_submitter,permalink,score,stickied,subreddit,subreddit_type
0,YouthfulPhotographer,,,Welcome to generation void,True,0,1517440000.0,,False,0,False,/r/tumblr/comments/7uaobc/nihilism_across_gene...,7,False,tumblr,public
1,jasonklacour,,,Welcome,True,0,1517440000.0,,False,0,False,/r/pics/comments/7ude45/7_years_later_im_offic...,1,False,pics,public
2,Assassin2000,,,I'm 16 and the friend told me he was joking af...,True,0,1517440000.0,,False,0,True,/r/legaladvice/comments/7uegdc/took_a_awful_jo...,1,False,legaladvice,public
3,kawaiicicle,Employee,Assistant Manager,What? \nIt’s a niche rpg. Most rpg fans are ad...,True,0,1517440000.0,,False,0,False,/r/GameStop/comments/7u7mps/is_this_possible/d...,0,False,GameStop,public
4,recklessmaterialism,,,solid!,True,0,1517440000.0,,False,0,True,/r/Seattle/comments/7udrqf/in_town_for_only_a_...,-1,False,Seattle,public


# Remove corrupt rows and store as different .csv files

In [13]:
def is_number(num):
    if pd.isna(num):
        return True
    try:
        float(num)
        return True
    except ValueError:
        return False

def is_integer(num):
    if pd.isna(num):
        return True
    try:
        int(num)
        return True
    except ValueError:
        return False

def valid_name(name):
    name_regex = re.compile(r"\A[A-Za-z0-9][A-Za-z0-9_-]{1,20}\Z")
    return bool(name_regex.match(name)) or pd.isna(name)

def valid_body(body):
    return len(body.strip()) > 0 or pd.isna(body)

def is_boolean(boo):
    return (str(boo) in ['True', 'False']) or pd.isna(boo)

def valid_controversiality(controversiality):
    return (is_number(controversiality) and float(controversiality) <= 1 and float(controversiality) >= 0) or pd.isna(controversiality)

def valid_utc(utc):
    return is_number(utc) and (len(str(utc)) == 12 or len(str(utc)) == 10) or pd.isna(utc)

def valid_distinguished(distinguished):
    return (str(distinguished) in ['nan', 'moderator', 'admin', 'special']) or pd.isna(distinguished)

def valid_permalink(permalink):
    title_regex = re.compile(r"(comments)/[^/]*/(?P<title>[^/]*)")
    return title_regex.search(permalink).group('title') != None or pd.isna(permalink)

def valid_subreddit_type(subreddit_type):
    return str(subreddit_type) in ['public', 'restricted', 'user'] or pd.isna(subreddit_type)

In [213]:
def find_corrupt_rows(df):
    corrupt_rows = []
    for index, row in df.iterrows():
        
        is_invalid_feature_info = {'author': False, 'author_flair_css_class': False, 'author_flair_text': False, 'body': False,
       'can_gild': False, 'controversiality': False, 'created_utc': False, 'distinguished': False,
       'edited': False, 'gilded': False, 'is_submitter': False, 'permalink': False, 'score': False,
       'stickied': False, 'subreddit': False, 'subreddit_type': False}
        try:
            if not valid_name(row['author']):
                is_invalid_feature_info['author'] = True
            if not valid_body(row['body']):
                is_invalid_feature_info['body'] = True
            if not is_boolean(row['can_gild']) :
                is_invalid_feature_info['can_gild'] = True
            if not valid_controversiality(row['controversiality']):
                is_invalid_feature_info['controversiality'] = True
            if not valid_utc(row['created_utc']):
                is_invalid_feature_info['created_utc'] = True
            if not valid_distinguished(row['distinguished']):
                is_invalid_feature_info['distinguished'] = True
            if not is_boolean(row['edited']):
                is_invalid_feature_info['edited'] = True
            if not is_integer(row['gilded']):
                is_invalid_feature_info['gilded'] = True
            if not is_boolean(row['is_submitter']): 
                is_invalid_feature_info['is_submitter'] = True
            if not valid_permalink(row['permalink']): 
                is_invalid_feature_info['permalink'] = True
            if not is_integer(row['score']): 
                is_invalid_feature_info['score'] = True
            if not is_boolean(row['stickied']): 
                is_invalid_feature_info['stickied'] = True
            if not valid_name(row['subreddit']): 
                is_invalid_feature_info['subreddit'] = True
            if not valid_subreddit_type(row['subreddit_type']):
                is_invalid_feature_info['subreddit_type'] = True

            if any(list(is_invalid_feature_info.values())):
                corrupt_row_info = (index, is_invalid_feature_info)
                corrupt_rows.append(corrupt_row_info)
        except:
            corrupt_row_info = (index, is_invalid_feature_info)
            corrupt_rows.append(corrupt_row_info)
    return corrupt_rows

In [351]:
def get_valid_and_corrupt_df(df):
    corrupt_row_indices = find_corrupt_rows(df)
    print("Number of corrupt rows: ", len(corrupt_row_indices), "Number of valid rows: ", str(len(df)-len(corrupt_row_indices)))
    indices_to_drop, errors = zip(*corrupt_row_indices)
    indices_to_drop = [int(i) for i in indices_to_drop]
    corrupt_rows = df.iloc[indices_to_drop,:]
    valid_rows = df.copy()
    valid_rows = valid_rows.drop(df.index[indices_to_drop])
    return (valid_rows, corrupt_rows)

In [385]:
valid_train, invalid_train = get_valid_and_corrupt_df(train)

Number of corrupt rows:  308247 Number of valid rows:  2885987


In [386]:
valid_train.to_csv("reddit_train.csv", encoding='utf-8', index=False)
invalid_train.to_csv("invalid_reddit_train.csv", encoding='utf-8', index=False)

# Clean data further

Remove newlines from comment bodies, assert data types for features, replace na/null values

In [4]:
def enforce_data_types(df):
    original_data_types = {'author': str, 'author_flair_css_class': str, 'author_flair_text': str, 'body': str, 'can_gild': bool,
              'controversiality': float, 'created_utc': int, 'distinguished': str, 'edited': bool, 'gilded': int,
              'is_submitter': bool, 'permalink': str, 'score': int, 'stickied': bool, 'subreddit': str, 'subreddit_type':  str}
    return df.astype(original_data_types)

In [38]:
def fill_in_missing_values(df):
    avg_created_utc = np.mean([time for time in df['created_utc'] if not pd.isna(time)])
    avg_controversiality = np.mean([contro for contro in df['controversiality'] if not pd.isna(contro)])
    replacements = {'author':'', 'author_flair_css_class':'', 'author_flair_text':'', 'body':'',
                    'can_gild':True, 'controversiality': avg_controversiality, 'created_utc':avg_created_utc,
                    'distinguished':False, 'edited':False, 'gilded':False, 'is_submitter':False, 'permalink':'',
                    'score':0, 'stickied':False, 'subreddit':'', 'subreddit_type':'public'}
    return df.fillna(value=replacements)

In [6]:
def clean_bodies_df(df):
    df['body'] = pd.Series([str(body).replace('\n', ' ') for body in df['body']])
    return df

In [8]:
def clean(df):
    df = enforce_data_types(df)
    print("Data types: ", df.dtypes)
    df = fill_in_missing_values(df)
    df = clean_bodies_df(df)
    print("Columns with empty values: ", df.columns[df.isna().any()].tolist())
    return df

In [9]:
clean_valid_train = clean(valid_train.copy())

Data types:  author                     object
author_flair_css_class     object
author_flair_text          object
body                       object
can_gild                     bool
controversiality          float64
created_utc                 int32
distinguished              object
edited                       bool
gilded                      int32
is_submitter                 bool
permalink                  object
score                       int32
stickied                     bool
subreddit                  object
subreddit_type             object
dtype: object
Columns with empty values:  []


In [19]:
clean_valid_train.to_csv("reddit_train.csv", encoding='utf-8')

# Sentiment analysis

In [20]:
def analyze_sentiments(df):
    sentiment_analyzer = SentimentIntensityAnalyzer()
    positive = []
    neutral = []
    negative = []
    compound = []
    for text in df['body']:
        sentiment = sentiment_analyzer.polarity_scores(text)
        positive.append(sentiment['pos'])
        neutral.append(sentiment['neu'])
        negative.append(sentiment['neg'])
        compound.append(sentiment['compound'])
        if len(compound) % 500000 == 0:
            print(len(compound), "/", len(df['body']))
    df['positive_sentiment'] = pd.Series(positive)
    df['neutral_sentiment'] = pd.Series(neutral)
    df['negative_sentiment'] = pd.Series(negative)
    df['compound_sentiment'] = pd.Series(compound)
    return df

In [21]:
engineered_train = analyze_sentiments(clean_valid_train)

500000 / 2885987
1000000 / 2885987
1500000 / 2885987
2000000 / 2885987
2500000 / 2885987


# Extract post title from permalink

In [56]:
def extract_post_title(permalink):
    title_regex = re.compile(r"(comments)/[^/]*/(?P<title>[^/]*)")
    return title_regex.search(permalink).group('title').replace('_', ' ')

In [57]:
def extract_post_titles(df):
    df['post_title'] = pd.Series([extract_post_title(permalink) for permalink in df['permalink']])
    return df

In [58]:
engineered_train = extract_post_titles(engineered_train)
engineered_train.head()

Unnamed: 0,author,author_flair_css_class,author_flair_text,body,can_gild,controversiality,created_utc,distinguished,edited,gilded,...,permalink,score,stickied,subreddit,subreddit_type,positive_sentiment,neutral_sentiment,negative_sentiment,compound_sentiment,post_title
0,YouthfulPhotographer,,,Welcome to generation void,True,0.0,1517443200,,False,0,...,/r/tumblr/comments/7uaobc/nihilism_across_gene...,7,False,tumblr,public,0.5,0.5,0.0,0.4588,nihilism across generations
1,jasonklacour,,,Welcome,True,0.0,1517443200,,False,0,...,/r/pics/comments/7ude45/7_years_later_im_offic...,1,False,pics,public,1.0,0.0,0.0,0.4588,7 years later im officially an us citizen murica
2,Assassin2000,,,I'm 16 and the friend told me he was joking af...,True,0.0,1517443200,,False,0,...,/r/legaladvice/comments/7uegdc/took_a_awful_jo...,1,False,legaladvice,public,0.317,0.683,0.0,0.6249,took a awful joke seriously and now im in trouble
3,kawaiicicle,Employee,Assistant Manager,What? \r It’s a niche rpg. Most rpg fans are a...,True,0.0,1517443200,,False,0,...,/r/GameStop/comments/7u7mps/is_this_possible/d...,0,False,GameStop,public,0.0,1.0,0.0,0.0,is this possible
4,recklessmaterialism,,,solid!,True,0.0,1517443200,,False,0,...,/r/Seattle/comments/7udrqf/in_town_for_only_a_...,-1,False,Seattle,public,1.0,0.0,0.0,0.2244,in town for only a day whats the one place i must


# Subreddit mentions

In [53]:
def num_subreddit_mentions(body):
    try:
        return len(re.findall(r"(\A|[^/A-Za-z0-9])r/[^\s]+", body))
    except:
        return 0

In [54]:
def num_subreddit_mentions_df(df):
    df['num_subreddit_mentions'] = pd.Series([num_subreddit_mentions(body) for body in df['body']])
    return df

In [55]:
engineered_train = num_subreddit_mentions_df(engineered_train)

# User mentions

In [26]:
def num_user_mentions(body):
    return len(re.findall(r"(\A|[^/A-Za-z0-9])u/[^\s]+", body))

In [29]:
def num_user_mentions_df(df):
    df['num_user_mentions'] = pd.Series([num_user_mentions(body) for body in df['body']])
    return df

In [30]:
engineered_train = num_user_mentions_df(engineered_train)

# Has flair

In [59]:
engineered_train = fill_in_missing_values(engineered_train)

In [33]:
def has_flair_df(df):
    df['has_flair'] = pd.Series([(len(flair_a) + len(flair_b)) > 0 for flair_a, flair_b in zip(df['author_flair_css_class'].tolist(), df['author_flair_text'].tolist())])
    return df

In [61]:
engineered_train = has_flair_df(engineered_train)

# Link extraction

In [64]:
def number_of_outside_links(text):
    link_regex = re.compile(r"(?P<url>https?://[^\s]+)")
    all_matches = link_regex.findall(text)
    outside_matches = [match for match in all_matches if "reddit.com" not in match and "redd.it" not in match]
    return len(outside_matches)

In [66]:
def number_of_reddit_links(text):    
    link_regex = re.compile(r"(?P<url>https?://[^\s]+)")
    all_matches = link_regex.findall(text)
    reddit_matches = [match for match in all_matches if "reddit.com" in match or "redd.it" in match]
    return len(reddit_matches)

In [70]:
def get_links(body):
    link_regex = re.compile(r"(?P<url>https?://[^\s]+)")
    all_matches = link_regex.findall(body)
    return all_matches

In [68]:
def extract_links(df):
    df['num_outside_links'] = pd.Series([number_of_outside_links(body) for body in df['body']])
    df['num_reddit_links'] = pd.Series([number_of_reddit_links(body) for body in df['body']])
#     df['links'] = pd.Series([get_links(body) for body in df['body']])
    return df

In [71]:
engineered_train = extract_links(engineered_train)

# Your part

In [91]:
import textstat

#generating flesch_kincaid_readability
#vocabulary difficulty
def generatefkReadability(frame):
    comments=frame.body
    textrating=[]
    for comment in comments:
    textrating.append(textstat.text_standard(comment),float_output=True)
    return textrating

#for length of comment
def wordLengthGenerator(frame):
    comments=frame.body
    lengths=[]
    for comment in comments:
        lengths.append(len(comment))
    return lengths

#returns the ratio of uppercase to lower case letters
#returns 0 if length is zero
def upperCaseGenerator(frame):
    comments=frame.body
    ratios=[]
    for comment in comments:
        count = 0
        for c in comment:
            if(c.isupper()):
                count = count + 1
        if(len(comment)>0):
            ratioval=float(count/len(comment))
        else:
            ratioval=0
        ratios.append(ratioval)
    return ratios

#counts number of questions in a comment
#by counting number of question marks
def numberOfQuestionsInAComment(frame):
    comments=frame.body
    questions=[]
    for comment in comments:
        count=0
        for c in comment:
            if(c=='?'):
                count=count+1
        questions.append(count)
    return questions


#naive method of determining whether or
#not the title is a question
def isTitleAQuestion(frame):
    START_WORDS = ["who", "what", "when", "where", "why", "how", "is", "can", "does", "do",
                  "could","should","would","which","whose","whom","are"]
    permalinks=frame.permalink
    isQuestion=[]
    for permalink in permalinks:
        splitty=permalink.split('/')
        title=splitty[5]
        wordsInTitle=title.split('_')
        firstWord=wordsInTitle[0]
        if(firstWord in START_WORDS):
            isQuestion.append(1)
        else:
            isQuestion.append(0)
    
    return isQuestion

In [92]:
readability= generatefkReadability(engineered_train)
length=wordLengthGenerator(engineered_train)
uppercase=upperCaseGenerator(engineered_train)

NameError: name 'numberofQuestionsInAComment' is not defined

In [99]:
numq=numberOfQuestionsInAComment(engineered_train)
istitleq=isTitleAQuestion(engineered_train)

In [114]:
def fix_contractions(frame):
    #newthing=frame.copy()
    contraction_list={"arent":"aren't","cant":"can't","couldnt":"couldn't","didnt":"didn't","doesnt":"doesn't",
                 "dont":"don't","hadnt":"hadn't","hasnt":"hasn't","havent":"haven't","hed":"he'd","hes":"he's",
                 "id":"i'd","im":"i'm","ive":"i've","isnt":"isn't","lets":"let's","shouldnt":"shouldn't","thats":"that's",
                 "theres":"there's","theyd":"they'd","theyll":"they'll","theyre":"they're","theyve":"they've","theyre":"they're",
                  "wed":"we'd","weve":"we've","werent":"weren't","whatll":"what'll","whatre":"what're","whats":"what's",
                  "whatve":"what've","wheres":"where's","whos":"who's","wholl":"who'll","wont":"won't","wouldnt":"wouldn't",
                  "youd":"you'd","youll":"you'll","youre":"you're","youve":"you've"}
    titles=[]
    for fv in frame["post_title"]:
        titlewords=fv.split(' ')
        twordnew=""
        for word in titlewords:
            if word in contraction_list.keys():
                newword=contraction_list[word]
                twordnew+=(newword)+" "
            else:
                twordnew+= (word)+" "
        titles.append(twordnew[:-1])
    return titles

In [115]:
titlesfix=fix_contractions(engineered_train)
titlesfix=pd.DataFrame(titlesfix)
titlesfix.head()

Unnamed: 0,0
0,nihilism across generations
1,7 years later i'm officially an us citizen murica
2,took a awful joke seriously and now i'm in tro...
3,is this possible
4,in town for only a day what's the one place i ...


In [131]:
import wordsegment as ws
ws.load()
print(segment(ws.clean("workswithotherlanguages")))

['works', 'with', 'other', 'languages']


In [137]:
#add
titlesfix=pd.DataFrame(titlesfix) #segmented reddit titles
istitleq=pd.DataFrame(istitleq) #is the post title a question
numq=pd.DataFrame(numq) #number of questions in a comment
readability=pd.DataFrame(readability) #readability score
uppercase=pd.DataFrame(uppercase) #uppercase to lowercase ratio in post
length=pd.DataFrame(length) #length of comment

In [136]:
from wordsegment import load, segment
def segmentSubreddits(frame):
    subreddits=frame['subreddit']
    finalsb=[]
    ws.load()
    for i, sb in zip(range(len(subreddits)), subreddits):
        if i%25000 == 0:
            print(i/25000)
        if len(sb) != 0:
            segmented=segment(ws.clean(sb))
            stringsb=""
            for c in segmented:
                stringsb+=c+" "
            finalsb.append(stringsb[:-1])
        else:
            finalsb.append(sb)
    return finalsb

subreddits_fixed=pd.DataFrame(segmentSubreddits(engineered_train))
subreddits_fixed.head()


0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
25.0
26.0
27.0
28.0
29.0
30.0
31.0
32.0
33.0
34.0
35.0
36.0
37.0
38.0
39.0
40.0
41.0
42.0
43.0
44.0
45.0
46.0
47.0
48.0
49.0
50.0
51.0
52.0
53.0
54.0
55.0
56.0
57.0
58.0
59.0
60.0
61.0
62.0
63.0
64.0
65.0
66.0
67.0
68.0
69.0
70.0
71.0
72.0
73.0
74.0
75.0
76.0
77.0
78.0
79.0
80.0
81.0
82.0
83.0
84.0
85.0
86.0
87.0
88.0
89.0
90.0
91.0
92.0
93.0
94.0
95.0
96.0
97.0
98.0
99.0
100.0
101.0
102.0
103.0
104.0
105.0
106.0
107.0
108.0
109.0
110.0
111.0
112.0
113.0
114.0
115.0


Unnamed: 0,0
0,tum blr
1,pics
2,legal advice
3,gamestop
4,seattle


In [123]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn import discriminant_analysis
from imblearn.under_sampling import CondensedNearestNeighbour

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids 

from sklearn.multioutput import MultiOutputClassifier

from sklearn.ensemble import ExtraTreesClassifier

from sklearn.decomposition import PCA, NMF
from sklearn.datasets import load_digits

from scipy import sparse
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix, hstack, vstack

from time import time
from pprint import pprint
import pickle

import re
from itertools import compress
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.utils.extmath import density
from sklearn.utils import resample
from collections import Counter

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer

from sklearn.base import BaseEstimator

path = './input/'
#path = '../input/'
data = pd.read_csv(path+'train.csv')
submission_input=pd.DataFrame(enumerate(df['body'].tolist()))
#submission_input = pd.DataFrame()    #pd.read_csv(path+'test.csv')
print('Number of rows and columns in the train data set:',data.shape)
print('Number of rows and columns in the test data set:',submission_input.shape)

target_col = data.columns.values[range(2,8)]
targets = data.iloc[:,range(2,8)]
toxic_comment_data = data.loc[(targets==1).any(axis=1),:]

class multi_labels_LogisticRegression:
   
    def __init__(self,penalty='l2', dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1,
                min_df = 2, max_df = 0.95, n_gram = 4, n_features = 2000,
                resampling = True, top_features = 1000,
                all_comments = False,
                num_class = 1, train_test_ratio = 0.8,
                analyzers = ['char','word'],
                fs = 'tree_50',
                estimator = 'nn',
                resampling_method = 'random',
                preprocess = True,
                fs_before_resampling = True):
        
        self.penalty = penalty
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.class_weight = class_weight
        self.random_state = random_state
        self.solver = solver
        self.max_iter = max_iter
        self.multi_class = multi_class
        self.verbose = verbose
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        
        #self.word_vectorizer = []
        self.list_LogisticRegression = []
        self.list_label = []
        
        #Probability predictions of Training and Testing
        self.y_train_true = []
        self.y_train_scores = []
        self.y_test_true = []
        self.y_test_scores = []
        
        self.word_vectorizers = []
        #self.word_vectorizer_w = []
        self.path = './input/' 
        self.target_col = []
        self.list_topWord_Index = []
        self.train_test_ratio = train_test_ratio
        
        #Score of Training and Testing
        self.train_score = []
        self.test_score = []
        
        #Model training scorer
        self.train_scorer_ = []
        self.results = []
        
        # Word matrix conversion factors
        self.min_df = min_df
        self.max_df = max_df
        self.n_gram = n_gram
        self.n_features = n_features
        self.analyzers = analyzers
        
        self.resampling = resampling
        self.resampling_method = resampling_method
        self.top_features = top_features
        
        self.all_comments = all_comments
        self.num_class = num_class
        
        self.fs = fs
        self.estimator = estimator
        
        self.preprocess = preprocess
        self.fs_before_resampling = fs_before_resampling
        
    
    # Transforming comments list into word frequency matrix
    #def data_transform(self, comments_list, vectorizers_list = self.word_vectorizers):
    #    word_data_list = []
    #    for i,word_vector in enumerate(vectorizers_list):
    #        word_data_list = word_data_list + [word_vector.transform(comments_list)]
    #    
    #    return sparse.hstack(word_data_list)
    
    def data_transform(self, comments_list):
        word_data_list = []
        for i,word_vector in enumerate(self.word_vectorizers):
            word_data_list = word_data_list + [word_vector.transform(comments_list)]
        
        return sparse.hstack(word_data_list)
    
    # Using Forest Trees to select top words
    def features_selection(self, X,y):
        
        t0 = time()
        if 'tree' in self.fs:
            n = np.int(self.fs.split('_')[1])
            print("Processing features selection by %d Forest Trees:..." %n)

            forest = ExtraTreesClassifier(n_estimators=n,
                                  random_state=0, n_jobs=-1)

            forest.fit(X,y)

            #nb = MultinomialNB()
            #nb.fit(X,y)

            importances = forest.feature_importances_
            std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                         axis=0)
            indices = np.argsort(importances)[::-1]
            index_ = indices[range(self.top_features)]
            
        elif self.fs == 'sv':
            svc = SVC(kernel="linear")
            print("Processing features selection by Kernel:...")
            # The "accuracy" scoring is proportional to the number of correct
            # classifications
            rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
                          scoring='accuracy', n_jobs=-1)
            rfecv.fit(X, y)

            print("Optimal number of features : %d" % rfecv.n_features_)

            # Plot number of features VS. cross-validation scores
            plt.figure()
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (nb of correct classifications)")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
            plt.show()

            index_ = rfecv.support_
            print("Number of features selected by SV: %d" %index_.sum())
            
        elif 'chi' in self.fs:
            n = np.int(self.fs.split('_')[1])
            print("Processing features selection by Chi2, k = %d:..." %n)
            ch2 = SelectKBest(chi2, k= n)
            X_train = ch2.fit_transform(X, y)
            
            index_ = ch2.get_support(indices=False)
            print("Number of features selected by Chi2: %d" %index_.sum())
            print(index_)
        else:
            index_ =  np.array([True]*X.shape[1])
        
        train_time = time() - t0
        print("&" * 80)
        print("Features Selection time: %0.3fs" % train_time)
        
        return index_
    
    # Resampling unbalanced dataset
    def under_resampling_func(self, X, y):
        t0 = time()
        print("Processing Resampling...")
        
        if 'random' in self.resampling_method:
            rus = RandomUnderSampler(random_state=1)
        else:
            rus = ClusterCentroids(random_state=1, n_jobs=-1)
        
        X_res, y_res = rus.fit_sample(X, y)

        train_time = time() - t0
        print("Resampling time: %0.3fs" % train_time)
            
        return X_res, y_res
    
    # Extracting dataset per "Target" class
    def train_test_target_split(self, data, toxic_class, fraction = 0.8):
        # Extracting data per "toxic_class"
        is_toxic = data[data[toxic_class]==1]
        no_toxic = data[~data.index.isin(is_toxic.index)]

        # Shuffling the data before taking samples
        is_toxic = shuffle(is_toxic)
        no_toxic = shuffle(no_toxic)

        ### Splitting data into Training and Testing

        # Toxic data
        training_toxic = is_toxic.sample(frac=fraction)
        testing_toxic = is_toxic.loc[~is_toxic.index.isin(training_toxic.index)]

        # Non-toxic data
        training_no_toxic = no_toxic.sample(frac=fraction)
        testing_no_toxic = no_toxic.loc[~no_toxic.index.isin(training_no_toxic.index)]

        # Training and Testing datasets
        data_training = shuffle(pd.concat([training_toxic, training_no_toxic], axis = 0))
        data_testing = shuffle(pd.concat([testing_toxic, testing_no_toxic], axis = 0))

        # Training and Testing 'comment_text' preditor, and target for "toxic" class
        y_training_toxic = data_training[toxic_class]
        y_testing_toxic = data_testing[toxic_class]

        X_training_comment = data_training['comment_text']
        X_testing_comment = data_testing['comment_text']
        
        # Generating Word_Vector for Training and Testing data
        #print("Extracting features from the training data using a sparse vectorizer for class: %s" %toxic_class)
        X_train = self.data_transform(X_training_comment)
        #print("n_samples: %d, n_features: %d" % X_train.shape)

        #print("Extracting features from the test data using the same vectorizer for class: %s" %toxic_class)
        X_test = self.data_transform(X_testing_comment)
        #print("n_samples: %d, n_features: %d" % X_test.shape)

        # ***** Targets of Training and Testing datasets
        y_train, y_test = y_training_toxic, y_testing_toxic

        return {'X_train':X_train, 'y_train':y_train ,'X_test':X_test, 'y_test': y_test}
    
    # Transforming Comment dataset into Words matrix dataset
    def words_matrix_convert(self, X, y):
        
        # Generating Word_Vector for Training and Testing data
        print("Converting Comments dataset to Words Matrix")
        X_ = self.data_transform(X)
        print("n_samples: %d, n_features: %d" % X_.shape)

        return X_, y
    
    def build_word_vectors(self, data):
        t0 = time()
        word_vectorizers = []
        for analyzer in self.analyzers:    
            print("Building words vector from the comments of the training data using a sparse vectorizer %s" % analyzer)
            word_vectorizer = TfidfVectorizer(min_df = self.min_df, max_df = self.max_df, lowercase=True, analyzer=analyzer,
                            stop_words= 'english',ngram_range=(1,self.n_gram),max_features=self.n_features)
            
            if self.all_comments:
                # Building word vector on the whole dataset
                word_vector = word_vectorizer.fit_transform(data['comment_text'])#(data['comment_text'])
            else:
                targets = data.iloc[:,range(2,8)]
                # Building word vector basing on Toxic comments only:
                word_vector = word_vectorizer.fit_transform(data.loc[(targets==1).any(axis=1),:]['comment_text'])
             
            word_vectorizers = word_vectorizers + [word_vectorizer]

        train_time = time() - t0
        print("Word Matrix training time: %0.3fs" % train_time)
        
        return word_vectorizers
    
    # Probability prediction
    def toxic_predict(self, comments_list):
        
        if self.word_vectorizers == []:
            print('The models are not yet trained!')
        else:
            X = self.data_transform(comments_list)
        
            y_pred = []
            for i,col in enumerate(self.target_col):
                
                if self.top_features > 0:
                    X_ = X.tocsc()[:,self.list_topWord_Index[i]]
                else:
                    X_ = X
               
                p = np.round(self.list_LogisticRegression[i].predict_proba(X_)[:,1],3)
                y_pred = y_pred + [p]

        return np.transpose(y_pred)
    
    def feature_selection_func(self, X_train, y_train, X_test, y_test):
        if self.top_features > 0: 
            topWord_Index = self.features_selection(X_train, y_train)
            self.list_topWord_Index = self.list_topWord_Index + [topWord_Index]
            
            X_train_, y_train_ = X_train.tocsc()[:,topWord_Index], y_train
            print("Feature reduced X, n_samples: %d, n_features: %d" % X_train_.shape)
            
            X_test_, y_test_ = X_test.tocsc()[:,topWord_Index], y_test
        else:
            X_train_, y_train_, X_test_, y_test_ = X_train, y_train, X_test, y_test
        
        return X_train_, y_train_, X_test_, y_test_
    
    def preprocess_function(self, X_train, y_train, X_test, y_test, fs_before_resampling = True):
        
        if fs_before_resampling:
            X_train_, y_train_, X_test_, y_test_ = self.feature_selection_func(X_train, y_train, X_test, y_test)
            if self.resampling: 
                X_train_, y_train_ = self.under_resampling_func(X_train_, y_train_)
                print("Resampled X_train, n_samples: %d, n_features: %d" % X_train_.shape)    
            else:
                print("No resampling...")
        else:
            if self.resampling:
                X_train_, y_train_ = self.under_resampling_func(X_train, y_train)
                print("Resampled X_train, n_samples: %d, n_features: %d" % X_train_.shape)
            else:
                print("No resampling...")   
            X_train_, y_train_, X_test_, y_test_ = self.feature_selection_func(X_train_, y_train_, X_test, y_test)
            
        return X_train_, y_train_, X_test_, y_test_
    
    def training_LogisticRegression(self, X_train, y_train, X_test, y_test): 
                                    #preprocess = True, fs_before_resampling = True):#, scores = ['recall']): #'precision']
       
        #### feature selection BEFORE RESAMPLING:
        #if self.top_features > 0:
            
        #    topWord_Index = self.features_selection(X_train, y_train)
        #    self.list_topWord_Index = self.list_topWord_Index + [topWord_Index]

            # Reducing Features
        #    X_train, y_train = X_train.tocsc()[:,topWord_Index], y_train
        #    print("Feature reduced X_train, n_samples: %d, n_features: %d" % X_train.shape)

        #    X_test, y_test = X_test.tocsc()[:,topWord_Index], y_test
        #    print("Feature reduced X_test, n_samples: %d, n_features: %d" % X_test.shape)
        
        
        #if self.resampling:
        #   X_train, y_train = self.under_resampling_func(X_train, y_train)
        #    print("Resampled X_train, n_samples: %d, n_features: %d" % X_train.shape)
        #else:
            #X_train, y_train = X_train, y_train
        #    print("No resampling...")
        
        
        #### feature selection AFTER RESAMPLING:
        
        
        
        
        #######

        t0 = time()
        #for score in scores:
        #print("# Tuning hyper-parameters for %s" % score)
        print("******** Beginning Training Process: %s *********" %self.estimator)
        #sv = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % score)
        scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
        
        if 'lr' in self.estimator:
            ######Fitting Logistic Regression *********************
            param_grid = {'C': [10, 20, 50, 100, 200, 500, 1000] }

            sv = GridSearchCV(LogisticRegression(intercept_scaling=1,
                        dual=False, fit_intercept=True, penalty='l2', tol=0.0001),
                              param_grid = param_grid, n_jobs=-1, scoring=scoring, cv=10, refit='AUC')#, scoring='%s_macro' % score) # cv=5,
        
        elif 'sv' in self.estimator:
            #### Fitting SVM*******************
            parameters = {
                'alpha': (0.00001, 0.000001),
                'penalty': ('l2', 'elasticnet'),
                'n_iter': (10, 50, 80),
            }
            sv = GridSearchCV(SGDClassifier(),
                               n_jobs=-1, param_grid = parameters, scoring=scoring, cv=10, refit='AUC')
        
        else:
            #self.estimator == 'nn':
            #####Fitting CNN********************
            params = {'hidden_layer_sizes': [(50,),(100,),(200,) ,(50,50,),(100,50,)]}
            #params = {'hidden_layer_sizes': [(100,)],
            #         'solver': ('lbfgs', 'adam')}
            mlp = MLPClassifier()
            sv = GridSearchCV(mlp, param_grid = params, verbose=10, n_jobs=-1, cv=5, scoring=scoring, refit='AUC')#
        
        sv.fit(X_train, y_train)
        
        train_time = time() - t0
        print("********* TRAINING Time: %0.3fs" % train_time)

        print("Best parameters set found on development set:")
        print()
        print(sv.best_params_)
        print()
        

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true_test_sv, y_pred_test_sv = y_test, sv.predict(X_test)
        print(classification_report(y_true_test_sv, y_pred_test_sv))
        print()
        print("Confusion matrix:")
        print(metrics.confusion_matrix(y_true_test_sv, y_pred_test_sv))
            
        return sv, X_train, y_train, X_test, y_test
    
    # Fitting the models
    def fit(self, data):
        
        if self.word_vectorizers == []:
            self.word_vectorizers = self.build_word_vectors(data)
        
        self.target_col = data.columns.values[2:2+self.num_class]
        
        for i,col in enumerate(self.target_col):
            print('Building {} model for toxic class:{''}'.format(i,col)) 
    
            model_input = self.train_test_target_split(data, col, fraction = self.train_test_ratio)

            X_train, y_train  = model_input['X_train'], model_input['y_train'] 
            X_test, y_test = model_input['X_test'], model_input['y_test']
            
            print("FOR TRAINING - X_train, n_samples: %d, n_features: %d" % X_train.shape)
            print("FOR TESTING: X_test, n_samples: %d, n_features: %d" % X_test.shape)
            
            if self.preprocess:
                X_train, y_train, X_test, y_test = self.preprocess_function(X_train, y_train, 
                                                                            X_test, y_test, 
                                                                            self.fs_before_resampling)

            sv, X_train, y_train, X_test, y_test = self.training_LogisticRegression(X_train, y_train, X_test, y_test)

            self.list_label = self.list_label + [col]
            self.list_LogisticRegression = self.list_LogisticRegression + [sv]
            
            self.y_train_true = self.y_train_true + [y_train]
            self.y_train_scores = self.y_train_scores + [sv.predict_proba(X_train)]
            
            self.y_test_true = self.y_test_true + [y_test]
            self.y_test_scores = self.y_test_scores + [sv.predict_proba(X_test)]
            
            self.train_score = self.train_score + [sv.score(X_train, y_train)]
            self.test_score = self.test_score + [sv.score(X_test, y_test)]
            
            if self.estimator != 'nn':
                self.train_scorer_ = self.train_scorer_ + [sv.scorer_]
                self.results = self.results + [sv.cv_results_]

            print('=' * 80)
            
    def print_roc_curve(self):
        
        for i,col in enumerate(self.target_col):
            fpr_test, tpr_test, _ = roc_curve(self.y_test_true[i], self.y_test_scores[i][:,1])
            fpr_train, tpr_train, _ = roc_curve(self.y_train_true[i], self.y_train_scores[i][:,1])

            roc_auc_test = auc(fpr_test, tpr_test)
            roc_auc_train = auc(fpr_train, tpr_train)

            plt.figure()
            lw = 2
            plt.plot(fpr_test, tpr_test, color='darkorange',
                     lw=lw, label='Test ROC curve (area = %0.2f)' % roc_auc_test)
            plt.plot(fpr_train, tpr_train, color='blue',
                     lw=lw, label='Train ROC curve (area = %0.2f)' % roc_auc_train)
            plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('ROC curve for toxic comment of label: {''}' .format(col))
            plt.legend(loc="lower right")
            plt.show()
    
    def print_grid_search(self):
        scoring, results = self.list_LogisticRegression[0].scorer_, self.list_LogisticRegression[0].cv_results_
        
        plt.figure(figsize=(13, 13))
        plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
                  fontsize=16)

        plt.xlabel("C param")
        plt.ylabel("Score")
        plt.grid()

        ax = plt.axes()
        ax.set_xlim(0, 402)
        ax.set_ylim(0.73, 1)

        # Get the regular numpy array from the MaskedArray
        X_axis = np.array(results['param_C'].data, dtype=float)

        for scorer, color in zip(sorted(scoring), ['g', 'k']):
            for sample, style in (('train', '--'), ('test', '-')):
                sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
                sample_score_std = results['std_%s_%s' % (sample, scorer)]
                ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                                sample_score_mean + sample_score_std,
                                alpha=0.1 if sample == 'test' else 0, color=color)
                ax.plot(X_axis, sample_score_mean, style, color=color,
                        alpha=1 if sample == 'test' else 0.7,
                        label="%s (%s)" % (scorer, sample))

            best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
            best_score = results['mean_test_%s' % scorer][best_index]

            # Plot a dotted vertical line at the best score for that scorer marked by x
            ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                    linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

            # Annotate the best score for that scorer
            ax.annotate("%0.2f" % best_score,
                        (X_axis[best_index], best_score + 0.005))

        plt.legend(loc="best")
        plt.grid('off')
        plt.show()
        
    def submission_output(self, submission_input):
        Y_submit_pred = pd.DataFrame(self.toxic_predict(submission_input['comment_text']))
        Y_submit_pred.columns = np.array(self.target_col)
        submission_output = pd.concat([submission_input['id'], Y_submit_pred],axis=1)

        return submission_output
    
    def to_submission_file(self, submission_output, filename = 'submission.csv'):
        submission_output.to_csv(self.path+filename, index=False)
        
        #range(1), num_class = 1
submission_data = []
list_LR = []
for i in range(1):
    print("Training Process %s: " %i)
    print("%"*80)
    multi_labels_LR = multi_labels_LogisticRegression(min_df = 2, max_df = 0.95, n_gram = 5, n_features = 10000,
                resampling = True, top_features = 5000,
                all_comments = False,
                num_class = 1, analyzers = ['char', 'word'], estimator = 'lr', fs = 'tree_25')
    multi_labels_LR.fit(data)
    submission_data = submission_data + [multi_labels_LR.submission_output(submission_input)]
    list_LR = list_LR + [multi_labels_LR]
    
df_concat = pd.concat(submission_data)
by_row_index = df_concat.groupby([df_concat.index, df_concat.id])
df_means = by_row_index.mean()
df_means = df_means.reset_index('id')

toxic_comments=pd.DataFrame(df_means)
#df_means.to_csv(path+'submission.csv', index=False)

ImportError: cannot import name '_joblib_parallel_args'

In [None]:
toxic_comments.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def build_dictionary(frame):
    corpus=frame['body']
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    return vectorizer.get_feature_names()

dict123=build_dictionary(engineered_train)

# Adding fixed post title, whether title is a question, the number of questions in the comment, readability, ratio of uppercase to lowercase, and length of comment

In [142]:
engineered_train['post_title'] = titlesfix[0]
engineered_train['title_is_question'] = istitleq[0]
engineered_train['num_questions'] = numq[0]
engineered_train['readability'] = readability[0]
engineered_train['uppercase_ratio'] = uppercase[0]
engineered_train['length'] = length[0]

In [24]:
engineered_train['subreddit_sep'] = subreddits_fixed[0]

NameError: name 'subreddits_fixed' is not defined

# Doc2Vec

In [4]:
from gensim.test.utils import common_corpus, common_texts, get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec

class EpochSaver(CallbackAny2Vec):

    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0

    def on_epoch_end(self, model):
#         output_path = get_tmpfile('{}_epoch{}.model'.format(self.path_prefix, self.epoch))
#         model.save(output_path)
        model.save('newest_d2v_model.model')
        self.epoch += 1

class EpochLogger(CallbackAny2Vec):

    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))
        print(datetime.datetime.now())

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        print(datetime.datetime.now())
        self.epoch += 1

In [5]:
d2v_model = Doc2Vec.load("final_d2v_model.model")

In [6]:
embeddings = []
for i in range(len(d2v_model.docvecs)):
    embeddings.append(d2v_model.docvecs[i])

In [15]:
embedding_df = pd.DataFrame()
embedding_num = 0
for positional_embedding in zip(*embeddings):
    col_name = "d2v_{0}".format(embedding_num)
    embedding_df[col_name] = pd.Series(positional_embedding)
    embedding_num += 1
    if embedding_num % 3 == 0:
        print(embedding_num/3)

1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
25.0
26.0
27.0
28.0
29.0
30.0
31.0
32.0
33.0


MemoryError: 

In [17]:
embedding_df.to_csv("d2v_train_doc_vecs.csv", index=False)

In [18]:
small_d2v_model = Doc2Vec.load("newest_small_d2v_model.model")

In [19]:
small_embeddings = []
for i in range(len(small_d2v_model.docvecs)):
    small_embeddings.append(small_d2v_model.docvecs[i])

In [21]:
small_embedding_df = pd.DataFrame()
embedding_num = 0
for positional_embedding in zip(*small_embeddings):
    col_name = "d2v_{0}".format(embedding_num)
    small_embedding_df[col_name] = pd.Series(positional_embedding)
    embedding_num += 1
    if embedding_num % 3 == 0:
        print(embedding_num/3)

1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0


In [22]:
small_embedding_df.to_csv("small_d2v_train_doc_vecs.csv", index=False)

In [27]:
post_title_similarity = []
for i, row in engineered_train.iterrows():
    post_title_similarity.append(similarity_unseen_docs(model, row['body'], row['post_title'], alpha=0.1, min_alpha=0.0001, steps=5))
    if i % 25000 == 0:
        print(i)

MemoryError: 

# Correct Types

In [None]:
data_types = {'author': str, 'author_flair_css_class': str, 'author_flair_text': str, 'body': str, 'can_gild': bool,
              'controversiality': float, 'created_utc': int, 'distinguished': str, 'edited': bool, 'gilded': int,
              'is_submitter': bool, 'permalink': str, 'score': int, 'stickied': bool, 'subreddit': str, 'subreddit_type':  str, 
              'num_outside_links': int, 'num_reddit_links': int, 'positive_sentiment': float, 'neutral_sentiment': float,
              'negative_sentiment': float, 'compound_sentiment': float, 'named_entity_ratio': float, 'post_title': str, 
              'relevance_to_title': float, 'subreddit_mentions': int, 'user_mentions': int, 'has_flair': bool, 'links': str}

# Save and drop unnecessary features before training them separately

In [None]:
# train = train.drop(['author', author_flair_css_class', 'author_flair_text', 'body', 'permalink', 'subreddit'] <- for training

In [145]:
engineered_train.to_csv("engineered_reddit_train.csv", encoding='utf-8', index=False)