 **1. Loading Data**

Methods to solve Multi_Label_Classification Problems
***https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/***

In [None]:
#importing the required libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#mount the drive to access information from drice
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#load the training and test data
train_data=pd.read_csv("/content/drive/My Drive/train.csv")
Predict_data=pd.read_csv("/content/drive/My Drive/test.csv")

In [None]:
#lets just take 14000(due to ram issues in colab) rows for trainin the model
train_data=(train_data.head(14000))
train_data.shape

(14000, 9)

In [None]:
#checking for missing values
missing_values_check = train_data.isnull().sum()
print(missing_values_check)

TITLE                   0
ABSTRACT                0
Computer Science        0
Physics                 0
Mathematics             0
Statistics              0
Quantitative Biology    0
Quantitative Finance    0
labels                  0
dtype: int64


***2.Data Preprocessing***

In [None]:
import re
def clean_data(sentence):
    """function that takes a sentence as an argument
    and returns it without any punctuations and numbers"""
    cleaned_sentence=sentence.lower()
    cleaned_sentence=re.sub("['!#$%&\'()*+,-./:;<=>?@[\]^_`{|}~']","",cleaned_sentence)
    cleaned_sentence=re.sub("[0123456789]","",cleaned_sentence)
    cleaned_sentence=cleaned_sentence.replace("\n","")
    cleaned_sentence=cleaned_sentence.strip()
    return cleaned_sentence

In [None]:
#lets just remove the column abstract and labels since we will be training our model on just title description
train_data=train_data.drop(["ABSTRACT","labels"],axis=1)
Predict_data=Predict_data.drop(["ABSTRACT"],axis=1)

In [None]:
#apply clean data function to labels in train_data and predict_data
train_data["TITLE"]=train_data["TITLE"].apply(clean_data)
Predict_data["TITLE"]=Predict_data["TITLE"].apply(clean_data)

In [None]:
#split the train_data to train and test the model
train, test = train_test_split(train_data, random_state=1, test_size=0.30, shuffle=True)
print(train.shape)
print(test.shape)

(9800, 7)
(4200, 7)


In [None]:
#apply TfidfVectorizer to all titles
#Convert a collection of raw documents to a matrix of TF-IDF features
train_text = train['TITLE']
test_text = test['TITLE']
Predict_data_text=Predict_data["TITLE"]
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)
vectorizer.fit(Predict_data_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents='unicode',
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['TITLE'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ["TITLE"], axis=1)

Predict_data_text=vectorizer.transform(Predict_data_text)

***3.Building the model***

In [None]:
#using classifier chains
classifier = ClassifierChain(LogisticRegression())
# Training logistic regression model on train data
classifier.fit(x_train, y_train)
#predict
predictions = classifier.predict(x_test)

In [None]:
#check predictions is of what data type and change it to nd-arraay
print(type(predictions))
#convert predictions to predictions array type
predictions=predictions.toarray()

<class 'scipy.sparse.csc.csc_matrix'>


***4.Saving the model***

In [None]:
#saving the model to save weights
import pickle
save_model=pickle.dumps(classifier)

In [None]:
#checking the accuracy of model
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.555952380952381




In [None]:
def changeLabels(prediction):
    #list that stores labels of all features
    actualLabels=[]
    """changing 1's and 0's to actual labels(Computer Science,Physics,...)"""
    for pred in prediction:
        present=[]#stores the labels of a present pred
        if pred[0]==1: present.append("Computer Science")
        if pred[1]==1: present.append("Physics")
        if pred[2]==1: present.append("Mathematics")
        if pred[3]==1: present.append("Quantitative Biology")
        if pred[4]==1: present.append("Quantitative Finance")
        actualLabels.append(present)
    return actualLabels

In [None]:
p=changeLabels(predictions)
p[:10]

[['Physics'],
 ['Mathematics'],
 ['Computer Science'],
 ['Mathematics'],
 ['Mathematics'],
 ['Computer Science', 'Quantitative Biology'],
 ['Quantitative Biology'],
 ['Mathematics'],
 ['Mathematics'],
 ['Physics']]

In [None]:
#transforming the list of labels to comma seperated labels
def transform(label):
    #label...>predictions after changing 0's and 1's to list of labels
    finalLabels=[]
    for i in label:
        finalLabels.append(",".join(i))
    return finalLabels

In [None]:
afterChanging=transform(p)
afterChanging[:10]

['Physics',
 'Mathematics',
 'Computer Science',
 'Mathematics',
 'Mathematics',
 'Computer Science,Quantitative Biology',
 'Quantitative Biology',
 'Mathematics',
 'Mathematics',
 'Physics']

***5.Using the model to Predict the labels for TEST.CSV***

In [None]:
predictions_on_test_Matrix=classifier.predict(Predict_data_text)#returns a csc_matrix
predictions_on_test_asNumpyArray=predictions_on_test_Matrix.toarray()#convert into numpy array
predictions_on_test_labels=changeLabels(predictions_on_test_asNumpyArray)#convert 0's and 1's to labels
predictions_on_test_asStrings=transform(predictions_on_test_labels)#transform list of labels to strings

In [None]:
#check the prediction_on_test_asStrings
print(predictions_on_test_asStrings[:10])

['Mathematics', 'Physics', 'Quantitative Biology', 'Physics', 'Mathematics', 'Mathematics', 'Quantitative Biology', 'Physics', 'Computer Science', 'Mathematics']


***6.Make a dataframe of predictions and save it as a csv file***

In [None]:
#dictionary
from pandas import DataFrame
dict={"labels":predictions_on_test_asStrings}
dataframe=DataFrame(dict)

In [None]:
#dataframe
dataframe.head()
dataframe.to_csv("/content/drive/My Drive/predictions_on_test.csv",sep="\t",index=True)

***7.Using label power set***

In [None]:
from skmultilearn.problem_transform import LabelPowerset
classifier2 = LabelPowerset(LogisticRegression())
classifier2.fit(x_train, y_train)
predictions2 = classifier2.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions2))
print("\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy =  0.5842857142857143




In [None]:
#use the second model to predict the labels of test.csv file
preds_second_model=classifier2.predict(Predict_data_text)
preds_second_model_numpyArray=preds_second_model.toarray()
preds_second_labels=changeLabels(preds_second_model_numpyArray)#convert 0's and 1's to labels
preds_second_asStrings=transform(preds_second_labels)#transform list of labels to strings

In [None]:
preds_second_asStrings[:10]

['Computer Science',
 'Physics',
 'Physics',
 'Physics',
 'Computer Science',
 'Mathematics',
 'Computer Science',
 'Physics',
 'Mathematics',
 'Mathematics']

In [None]:
#converting the labels to dataframe and save it as csv file
dict={"labels":preds_second_asStrings}
dataframe2=DataFrame(dict)
dataframe.to_csv("/content/drive/My Drive/predictions_on_test_second.csv",sep="\t",index=True)