In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

In [None]:
# Read the train and test CSV files of the multilabel dataset into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/multilabel-classification-dataset/train.csv',
        '../input/multilabel-classification-dataset/test.csv',
    )
)

In [None]:
# Peek at five random training rows. The fixed random_state keeps the displayed
# sample identical across "Restart & Run All" instead of changing on every run.
train_df.sample(5, random_state=42)

In [None]:
# Names of the target columns; one classifier is trained per entry, in this order.
cols_target = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]

In [None]:
# Summary statistics of the training frame (pandas `describe` with its default settings).
train_df.describe()

In [None]:
# Rows in which none of the target columns equals 1, i.e. abstracts that carry
# no subject label at all. The mask is driven by cols_target so it stays in step
# with the label list instead of repeating the six column names by hand.
unlabelled_in_all = train_df[(train_df[cols_target] != 1).all(axis=1)]
print('Percentage of unlabelled abstracts is ', len(unlabelled_in_all)/len(train_df)*100)

In [None]:
# Count the training rows whose ABSTRACT is missing.
no_comment = train_df.loc[train_df['ABSTRACT'].isna()]
no_comment.shape[0]

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Show any test rows whose ABSTRACT is missing (reuses the name `no_comment`).
no_comment = test_df.loc[test_df['ABSTRACT'].isna()]
no_comment

In [None]:
# Report the number of rows in each split, then the column sums of the target labels.
print('Total rows in test is {}'.format(test_df.shape[0]))
print('Total rows in train is {}'.format(train_df.shape[0]))
label_totals = train_df[cols_target].sum()
print(label_totals)

In [None]:
# Store the character length of every training abstract in a new column.
# Each value is passed through str() first, so non-string entries are measured too.
train_df['char_length'] = train_df['ABSTRACT'].map(lambda abstract: len(str(abstract)))

In [None]:
# Histogram of abstract character lengths in the training data.
# Title and axis labels are set so the figure can be read on its own.
sns.set()
ax = train_df['char_length'].hist()
ax.set_title('Training abstracts: character length')
ax.set_xlabel('Characters per abstract')
ax.set_ylabel('Number of abstracts')
plt.show()

In [None]:
# Keep only the target label columns for the correlation plot.
data = train_df.loc[:, cols_target]

In [None]:
# Heatmap of the pairwise correlations between the columns of `data`.
# `data` holds only the target label columns, so the title names the labels
# rather than "features & targets".
colormap = plt.cm.plasma
plt.figure(figsize=(7,7))
plt.title('Correlation between target labels',y=1.05,size=14)
sns.heatmap(data.astype(float).corr(),linewidths=0.1,vmax=1.0,square=True,cmap=colormap,
           linecolor='white',annot=True)
# Explicit show(), like the other plotting cells, so the Axes repr is not the cell output.
plt.show()

In [None]:
# Store the character length of every test abstract (values go through str() first).
test_df['char_length'] = test_df['ABSTRACT'].map(lambda abstract: len(str(abstract)))

In [None]:
# Histogram of abstract character lengths in the test data.
# Uses an explicit Axes and sets title and axis labels so the figure stands alone.
fig, ax = plt.subplots()
ax.hist(test_df['char_length'])
ax.set_title('Test abstracts: character length')
ax.set_xlabel('Characters per abstract')
ax.set_ylabel('Number of abstracts')
plt.show()

In [None]:
# Contraction expansions applied by clean_text, compiled once at definition time.
# Order matters: a specific pattern must run before a generic one that would
# consume part of it ("what's" and "'scuse" before "'s", "can't" before "n't").
_CONTRACTION_RULES = [
    (re.compile(r"what's"), "what is "),
    (re.compile(r"\'scuse"), " excuse "),
    (re.compile(r"\'s"), " "),
    (re.compile(r"\'ve"), " have "),
    (re.compile(r"can't"), "cannot "),
    (re.compile(r"n't"), " not "),
    (re.compile(r"i'm"), "i am "),
    (re.compile(r"\'re"), " are "),
    (re.compile(r"\'d"), " would "),
    (re.compile(r"\'ll"), " will "),
]
# Raw strings: "\W" and "\s" are regex classes, not Python string escapes.
_NON_WORD_CHAR = re.compile(r"\W")
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text):
    """Normalise a piece of text for vectorisation.

    Steps, in order: lower-case, expand a fixed set of English contractions,
    replace every non-word character with a space, collapse whitespace runs
    to a single space, and strip leading/trailing spaces.

    Parameters
    ----------
    text : str or any
        Text to clean. ``None`` and float NaN (how pandas represents a missing
        CSV cell by default) yield an empty string; any other non-string value
        is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    # Missing values: return an empty document instead of crashing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    text = _NON_WORD_CHAR.sub(" ", text)
    text = _WHITESPACE_RUN.sub(" ", text)
    return text.strip(" ")

In [None]:
# clean the ABSTRACT text in train_df [Thanks to Pulkit Jha for the useful pointer.]
train_df['ABSTRACT'] = train_df['ABSTRACT'].map(clean_text)

In [None]:
# clean the ABSTRACT text in test_df [Thanks, Pulkit Jha.]
test_df['ABSTRACT'] = test_df['ABSTRACT'].map(clean_text)

In [None]:
# Remove the helper column added for the length histogram.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
train_df = train_df.drop(columns='char_length', errors='ignore')

In [None]:
# Text inputs for the vectorizer: the ABSTRACT column of each split.
X = train_df['ABSTRACT']
test_X = test_df['ABSTRACT']

In [None]:
# Shapes of the train and test text inputs (one entry per row of each frame).
print(X.shape, test_X.shape)

In [None]:
# import and instantiate TfidfVectorizer
# NOTE(review): CountVectorizer is imported but not used in the cells shown here.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# max_features=5000 caps the vocabulary size; stop_words='english' selects
# scikit-learn's built-in English stop-word list.
vect = TfidfVectorizer(max_features=5000,stop_words='english')
# display the configured (not yet fitted) vectorizer
vect

In [None]:
# learn the vocabulary from the training abstracts, then use it to create a document-term matrix
X_dtm = vect.fit_transform(X)
# examine the document-term matrix created from X
X_dtm

In [None]:
# transform the test abstracts with the vocabulary fitted on the training data (no re-fitting here)
test_X_dtm = vect.transform(test_X)
# examine the document-term matrix created from test_X
test_X_dtm

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# A single estimator instance; the training loop re-fits it once per target label.
# C is the inverse regularisation strength.
# NOTE(review): how the value 23.0 was chosen is not recorded in this notebook.
logreg = LogisticRegression(C=23.0)

# Load the sample submission as a template; the training loop assigns one
# column of predicted probabilities per target label into this frame.
submission_chains = pd.read_csv('../input/multilabel-classification-dataset/sample_submission.csv')

# create a function to add features
def add_feature(X, feature_to_add):
    """Append extra feature column(s) to the right of a sparse matrix.

    ``feature_to_add`` is wrapped in a ``csr_matrix`` and transposed before
    stacking, so it may be a single feature or a list of features.
    The horizontally stacked result is returned in CSR format.
    """
    from scipy import sparse

    extra_columns = sparse.csr_matrix(feature_to_add).T
    return sparse.hstack([X, extra_columns], format='csr')

In [None]:
# Classifier chain: fit one logistic regression per label, in cols_target order,
# and append each label as an extra feature column for the labels that follow
# (true labels on the training side, predicted labels on the test side).
#
# The chain is grown on working copies so X_dtm / test_X_dtm keep their original
# columns. Re-running this cell therefore always starts from the same matrices
# instead of stacking further label columns (including the current target itself)
# onto matrices that were already extended by a previous run.
X_chain = X_dtm
test_X_chain = test_X_dtm
for label in cols_target:
    print('... Processing {}'.format(label))
    y = train_df[label]
    # train the model using the current chained features & y
    logreg.fit(X_chain, y)
    # compute the training accuracy (measured on the same rows the model was fitted on)
    y_pred_X = logreg.predict(X_chain)
    print('Training Accuracy is {}'.format(accuracy_score(y, y_pred_X)))
    # hard predictions for the test rows; these are what get chained below
    test_y = logreg.predict(test_X_chain)
    # column 1 of predict_proba is written to the submission
    # (the second entry of logreg.classes_; label 1 if the targets are 0/1)
    test_y_prob = logreg.predict_proba(test_X_chain)[:, 1]
    submission_chains[label] = test_y_prob
    # chain the true label onto the training features (add_feature returns a new matrix)
    X_chain = add_feature(X_chain, y)
    print('Shape of chained training matrix is now {}'.format(X_chain.shape))
    # chain the predicted label onto the test features
    test_X_chain = add_feature(test_X_chain, test_y)
    print('Shape of chained test matrix is now {}'.format(test_X_chain.shape))

In [None]:
# Preview the first rows of the filled-in submission frame.
submission_chains.head()

In [None]:
# generate submission file: write the frame to the working directory without the index column
submission_chains.to_csv('submission_chains.csv', index=False)