In [None]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Import Data
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_path = "/Users/Mesrop/Programming/ML/Dat264 Challenge/WBChallenge/train_values.csv"

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="skip"` is the replacement and likewise skips malformed rows.
data_raw = pd.read_csv(data_path, on_bad_lines="skip")
print("Number of rows in data =",data_raw.shape[0])
print("Number of columns in data =",data_raw.shape[1])
print("\n")
print("**Sample data:**")
print(data_raw.dtypes)

# Clean-up, written as reassignments instead of in-place mutation:
#   1. drop rows that contain any missing value,
#   2. coerce row_id to numeric (unparseable ids become NaN) and drop those rows,
#   3. cast row_id to int so it can serve as the merge key.
data_raw = data_raw.dropna()
data_raw = data_raw.assign(
    row_id=pd.to_numeric(data_raw['row_id'], errors='coerce')
).dropna()
data_raw = data_raw.assign(row_id=data_raw["row_id"].astype(int))
print(data_raw.dtypes)

data_raw.head(100)





In [None]:
#Import Labels
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
label_path = "/Users/Mesrop/Programming/ML/Dat264 Challenge/WBChallenge/train_labels.csv"

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="skip"` is the replacement and likewise skips malformed rows.
label_raw = pd.read_csv(label_path, on_bad_lines="skip")
print("Number of rows in label =",label_raw.shape[0])
print("Number of columns in label =",label_raw.shape[1])
print("\n")
print("**Sample label:**")
print(label_raw.dtypes)
label_raw.head()

In [None]:
# Inner join of the documents with their labels on row_id: only row ids
# present in both frames survive.
merge_data = pd.merge(data_raw, label_raw, on='row_id', how='inner')

# The output labels now name the merged frame (the originals read
# "Number of columns in  =" and "**Sample label:**").
print("Number of rows in merge =", merge_data.shape[0])
print("Number of columns in merge =", merge_data.shape[1])
print("\n")
print("**Sample merge:**")
merge_data.head(100)

In [None]:
# Per-column count of null cells in the merged frame.
null_mask = merge_data.isnull()
missing_values_check = null_mask.sum(axis=0)
print(missing_values_check)

In [None]:
# Label names are every column after the first two of the merged frame.
# NOTE(review): presumably the first two columns are row_id and doc_text -- confirm.
categories = merge_data.columns.tolist()[2:]
print(categories)

In [None]:
# Total of each label column, collected as (label name, column sum) pairs
# and shown as a two-column summary table.

counts = []
for category in categories:
    n_docs = merge_data[category].sum()
    counts.append((category, n_docs))
df_stats = pd.DataFrame(data=counts, columns=['category', 'number of doc'])
df_stats

In [None]:
sns.set(font_scale = 2)
plt.figure(figsize=(15,8))

# Column sums of every column after the first two; computed once and reused
# for both the bar heights and the text annotations.
label_totals = merge_data.iloc[:,2:].sum().values

# x/y are passed by keyword: current seaborn releases no longer accept them
# positionally, while the keyword form works on old and new versions alike.
ax = sns.barplot(x=categories, y=label_totals)

plt.title("Doc in each category", fontsize=24)
plt.ylabel('Number of Doc', fontsize=18)
plt.xlabel('Doc Type ', fontsize=18)

#adding the text labels
rects = ax.patches
labels = label_totals
for rect, label in zip(rects, labels):
    height = rect.get_height()
    # Centre the count horizontally on the bar, slightly above its top.
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)

plt.show()

In [None]:
# Row-wise sum over every column after the first two, then a frequency
# table of those sums.
rowSums = merge_data.iloc[:,2:].sum(axis=1)
multiLabel_counts = rowSums.value_counts()
# NOTE(review): value_counts() orders by frequency, so iloc[1:] drops the
# MOST FREQUENT sum, which is not necessarily the rows with zero (or one)
# labels -- confirm this is the intended filter.
multiLabel_counts = multiLabel_counts.iloc[1:]

sns.set(font_scale = 2)
plt.figure(figsize=(15,8))

# x/y are passed by keyword: current seaborn releases no longer accept them
# positionally, while the keyword form works on old and new versions alike.
ax = sns.barplot(x=multiLabel_counts.index, y=multiLabel_counts.values)

plt.title("Doc having multiple labels ")
plt.ylabel('Number of Doc', fontsize=18)
plt.xlabel('Number of labels', fontsize=18)

#adding the text labels
rects = ax.patches
labels = multiLabel_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    # Centre the count horizontally on the bar, slightly above its top.
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.show()

In [None]:
#Data cleaning

# Work on a random subsample of the merged frame.
# The original used np.random.choice(index, size=2000), which samples WITH
# replacement and without a seed: the same row could appear several times
# (and later end up in both the train and the test split), and every run
# produced a different subset. DataFrame.sample draws without replacement
# and is seeded here; the size is capped so small inputs do not raise.
SAMPLE_SIZE = 2000
SAMPLE_SEED = 42
data = merge_data.sample(
    n=min(SAMPLE_SIZE, len(merge_data)),
    random_state=SAMPLE_SEED,
).copy()  # explicit copy: later cells assign to data['doc_text']
data.head()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
#nltk.download()

In [None]:
def cleanHtml(sentence):
    """Replace every `<...>` tag-like span in `sentence` with a single space.

    The input is passed through `str()` first, so non-string values are
    accepted. The match is non-greedy and `.` does not cross newlines.
    """
    return re.sub('<.*?>', ' ', str(sentence))


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    """Strip punctuation from `sentence`.

    * `?`, `!`, `'`, `"`, `#` and `|` are deleted outright.
    * `.`, `,`, `(`, `)`, backslash and `/` are each replaced by a space.
    * Surrounding whitespace is stripped, then remaining newlines become
      spaces.

    The original character classes used `|` as if it were alternation
    (inside `[...]` it is a literal pipe), and its `\\|` was an escaped pipe,
    so the backslash was never matched. The classes are now written
    explicitly and include the backslash; pipes are still removed as before.
    """
    cleaned = re.sub('[?!\'"#|]', '', sentence)
    cleaned = re.sub(r'[.,()\\/]', ' ', cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n", " ")
    return cleaned


def keepAlpha(sentence):
    """Keep only ASCII letters in each whitespace-separated word.

    Within every word, each run of characters other than a-z / A-Z / space
    is replaced by one space; the words are then joined with single spaces
    and the result is stripped.
    """
    scrubbed = (re.sub('[^a-z A-Z]+', ' ', token) for token in sentence.split())
    return " ".join(scrubbed).strip()

In [None]:
# Normalise the text column: lowercase first, then run the three cleaners
# in order (tags -> punctuation -> letters only).
doc_text = data['doc_text'].str.lower()
for cleaner in (cleanHtml, cleanPunc, keepAlpha):
    doc_text = doc_text.apply(cleaner)
data['doc_text'] = doc_text
data.head()

In [None]:
#removing stop words
# NLTK's English stop words plus a few extra number and filler words.
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])

# A stop word must start at a word boundary and be followed by either one
# non-word character or the end of the text. The original pattern demanded
# the trailing \W, so a stop word that was the last token of a document was
# never removed. Words are escaped and sorted so the pattern is valid for
# any word and identical from run to run (set order is not stable).
re_stop_words = re.compile(
    r"\b(" + "|".join(re.escape(word) for word in sorted(stop_words)) + r")(?:\W|$)",
    re.I,
)

def removeStopWords(sentence):
    """Replace each stop word (plus the character after it, if any) with a space.

    Matching is case-insensitive and uses the module-level `re_stop_words`.
    """
    return re_stop_words.sub(" ", sentence)

data['doc_text'] = data['doc_text'].apply(removeStopWords)
data.head()

In [None]:
#Stemming
stemmer = SnowballStemmer("english")

def stemming(sentence):
    """Stem every whitespace-separated token of `sentence`.

    Each token is passed to the module-level Snowball `stemmer`; the stems
    are joined with single spaces and the result is stripped.
    """
    stems = [stemmer.stem(token) for token in sentence.split()]
    return " ".join(stems).strip()

data['doc_text'] = data['doc_text'].apply(stemming)
data.head()

In [None]:
#Split Train Test
from sklearn.model_selection import train_test_split

# Shuffled 70/30 split with a fixed seed.
train, test = train_test_split(data, test_size=0.30, shuffle=True, random_state=42)

for subset in (train, test):
    print(subset.shape)

# Text column of each split, used as vectorizer input.
train_text, test_text = train['doc_text'], test['doc_text']

In [None]:
#TF IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

# Fit on the training text ONLY. The original called fit() a second time on
# the test text; each fit() replaces what the previous one learned, so the
# features ended up being built from the test set alone (training vocabulary
# lost, test information leaked into the model).
vectorizer.fit(train_text)

# Features are the TF-IDF matrices; targets are every column except the id
# and the raw text.
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['row_id','doc_text'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['row_id','doc_text'], axis=1)

In [None]:
#Multiple Binary Classifications - (One Vs Rest Classifier)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier

In [None]:
%%time

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('F1Score {}'.format(f1_score(test[category], prediction, average='micro')))
    print("\n")

In [19]:
%%time
#Multiple Binary Classifications - (Binary Relevance)

# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print('F1Score {}'.format(f1_score(test[category], prediction, average='micro')))
print("\n")

In [None]:
# Classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

In [None]:

%%time

# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())

# Training logistic regression model on train data
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print('F1Score {}'.format(f1_score(test[category], prediction, average='micro')))
print("\n")

In [None]:
# Label Powerset
from skmultilearn.problem_transform import LabelPowerset

In [None]:
%%time

# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print('F1Score {}'.format(f1_score(test[category], prediction, average='micro')))
print("\n")