In [42]:
# Import libraries for preprocessing, feature extraction, dataset splitting, and training
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.metrics import f1_score,confusion_matrix,recall_score,classification_report
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, GridSearchCV,learning_curve
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_json("dataset.json")  # load the raw dataset into a DataFrame

In [3]:
# Text preprocessing applied to every document before feature extraction
def prepocess(array):
    """Normalise a Series of raw text documents.

    Steps, in order: lowercase, remove digit runs, remove ASCII punctuation,
    drop English stop words, and Snowball-stem the remaining tokens.

    Parameters
    ----------
    array : pandas.Series
        One text document per element. Missing values are treated as
        empty documents.

    Returns
    -------
    pandas.Series
        Same index as ``array``; each element is the cleaned,
        space-joined token string.
    """
    # A missing value would otherwise reach the lambda below as a float and crash on .split()
    cleaned = array.fillna('').str.lower()
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal by default,
    # which silently left every digit in place.
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    punc_table = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.str.translate(punc_table)

    stop = set(stopwords.words('english'))
    snow = nltk.stem.SnowballStemmer('english')
    return cleaned.apply(
        lambda sentence: ' '.join(snow.stem(word) for word in sentence.split() if word not in stop)
    )

In [79]:
# Index labels of the rows belonging to each category (input for undersampling)
indices = {
    category: np.array(df.index[df['category'] == category])
    for category in df['category'].unique()
}
def undersample(indices, threshold=3000, min_size=1004):
    """Down-sample over-represented categories and return the selected rows of ``df``.

    Every category with more than ``threshold`` index labels is reduced to a
    random subset (without replacement) whose size is drawn uniformly from
    ``[min_size, threshold)``. Smaller categories are kept whole.

    Parameters
    ----------
    indices : dict
        Maps category -> array of index labels of ``df`` for that category.
        The dict is NOT modified, so the cell can be re-run safely (the
        previous version shrank it in place and could then ask
        ``np.random.choice`` for more samples than were left).
    threshold : int, default 3000
        Categories larger than this are down-sampled.
    min_size : int, default 1004
        Smallest size a down-sampled category may be given.

    Returns
    -------
    pandas.DataFrame
        Rows of the module-level ``df`` selected by the sampled labels.
    """
    sampled = {}
    for category, labels in indices.items():
        labels = np.asarray(labels)
        if len(labels) > threshold:
            target = np.random.randint(min_size, threshold)  # upper bound is exclusive
            labels = np.random.choice(labels, target, replace=False)
        sampled[category] = labels

    if not sampled:
        # np.concatenate rejects an empty list; return an empty frame with df's columns
        return df.iloc[0:0]

    selected = np.concatenate(list(sampled.values()))
    # The arrays hold index *labels*, so select with .loc rather than .iloc
    return df.loc[selected, :]

In [80]:
# Build the balanced working set and encode the category labels as integers.
# drop() returns a new frame; the original call discarded it, so 'id' was never removed.
data = undersample(indices).drop(columns='id')
y = list(data['category'].unique())
# One code per category actually present (previously hard-coded to 41, which would
# silently truncate the mapping and raise KeyError for any further category).
no = list(range(len(y)))
dic = dict(zip(y, no))            # category name -> integer code
Y = data['category'].map(dic)
# Model input text: headline and short description joined by a space
X_data = data['headline'] + ' ' + data['short_description']

In [81]:
# Clean the text, then hand plain NumPy arrays to scikit-learn
X = np.array(prepocess(X_data))
Y = np.array(Y)

In [82]:
# Split the dataset into train and test sets (stratified, 40% held out).
# Only one split is ever used, so generate exactly one; the previous n_splits=5 loop
# computed five and kept just the last. random_state is fixed so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.40, random_state=42)
train_index, test_index = next(split.split(X, Y))
x_train, x_test = X[train_index], X[test_index]
y_train, y_test = Y[train_index], Y[test_index]

In [23]:
# Transformer that fits a Doc2Vec model on the training documents and turns documents into a feature matrix
class Doc2VecModel():
    """Doc2Vec wrapper exposing fit / transform / get_params / set_params.

    Parameters
    ----------
    dm : int, default 1
        Passed to ``Doc2Vec(dm=...)``.
    vector_size : int, default 200
        Passed to ``Doc2Vec(vector_size=...)``.
    window : int, default 1
        Passed to ``Doc2Vec(window=...)``.
    min_count : int, default 1
        Passed to ``Doc2Vec(min_count=...)``. Earlier versions ignored this
        argument and always trained with 5.
    workers : int, default 4
        Passed to ``Doc2Vec(workers=...)``.
    epochs : int, default 10
        Number of epochs used both for training and for ``infer_vector``.
    """

    # Names of the constructor arguments, i.e. the tunable hyper-parameters
    _PARAM_NAMES = ('dm', 'vector_size', 'window', 'min_count', 'workers', 'epochs')

    def __init__(self, dm=1, vector_size=200, window=1,min_count=1,workers=4, epochs=10):
        self.d2v_model = None
        self.vector_size = vector_size
        self.window = window
        self.dm = dm
        self.min_count=min_count
        self.workers=workers
        self.epochs = epochs

    def get_params(self, deep=True):
        """Return the hyper-parameters as a dict (``deep`` is accepted for scikit-learn compatibility)."""
        return {name: getattr(self, name) for name in self._PARAM_NAMES}

    def set_params(self,**params):
        """Set hyper-parameters by name and return ``self``.

        ``size`` is accepted as an alias of ``vector_size``. Any other unknown
        name raises ``ValueError`` instead of being stored silently and ignored.
        """
        for param, value in params.items():
            if param == 'size':
                param = 'vector_size'
            if param not in self._PARAM_NAMES:
                raise ValueError(
                    "Invalid parameter %r for Doc2VecModel; valid parameters are %s"
                    % (param, ', '.join(self._PARAM_NAMES))
                )
            setattr(self, param, value)
        return self

    def fit(self, x_train, y=None):
        """Train a fresh Doc2Vec model on ``x_train`` (iterable of whitespace-tokenisable strings)."""
        # Initialize model; min_count now comes from the instance instead of a hard-coded 5
        self.d2v_model = Doc2Vec(vector_size=self.vector_size, window=self.window, dm=self.dm,
                                 min_count=self.min_count, workers=self.workers)
        # Each document is tagged with its position in x_train
        tagged_documents = [TaggedDocument(row.split(), [i]) for i, row in enumerate(x_train)]
        # Build vocabulary
        self.d2v_model.build_vocab(tagged_documents)
        # Train model
        self.d2v_model.train(tagged_documents, total_examples=len(tagged_documents), epochs=self.epochs)
        return self

    def transform(self, x_train):
        """Infer one vector per document and return them stacked in a NumPy array."""
        # `epochs` replaces the `steps` keyword, which gensim 4 no longer accepts
        return np.array([self.d2v_model.infer_vector(row.split(), epochs=self.epochs) for row in x_train])

    def fit_transform(self, x_train, y=None):
        """Fit on ``x_train`` and return its inferred feature matrix."""
        return self.fit(x_train, y).transform(x_train)



In [None]:
# Learning-curve plot, used to judge whether a model suffers from high bias or high variance
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves for ``estimator``.

    The arguments ``estimator``, ``X``, ``y``, ``cv``, ``n_jobs`` and
    ``train_sizes`` are forwarded unchanged to ``learning_curve``. ``title``
    is the figure title and ``ylim``, when given, is unpacked into
    ``plt.ylim``.

    For each curve the mean score across the second axis of the returned
    score arrays is plotted as a line, with a translucent band of one
    standard deviation either side.

    Returns the ``matplotlib.pyplot`` module so callers can chain ``.show()``.
    """
    plt.figure(figsize=(12,8))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    plt.grid()

    curves = (
        (fit_scores, "r", "Training score"),
        (cv_scores, "g", "Cross-validation score"),
    )

    # Bands are drawn first and lines second, matching the original draw order
    for scores, colour, _ in curves:
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for scores, colour, label in curves:
        plt.plot(sizes, np.mean(scores, axis=1), 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# Raw string so that \g is not parsed as an (invalid) escape sequence; the gamma shown
# in the title now matches the value the estimator below is actually built with.
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.8$)"

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = svm.SVC(gamma=0.8,C=1)                     # support vector machine classifier
# NOTE(review): feature_train / label_train are not defined in any visible cell —
# presumably the Doc2Vec feature matrix of x_train and y_train; define them before running.
plot_learning_curve(estimator, title, feature_train,label_train , (0.7, 1.01), cv=cv, n_jobs=4)

plt.show()

In [25]:
# Hyper-parameter tuning with GridSearchCV over the Doc2Vec + SVM pipeline.
# The embedding-size key must be 'vector_size' (the Doc2VecModel constructor argument);
# the former 'doc2vec__size' key set an attribute the model never reads, so 20 of the
# grid values were searched without any effect.
# Grid size: 3 * 2 * 20 * 14 = 1680 candidates, each fitted 5 times — expect a long run.
param_grid = {'doc2vec__window': [1,2, 3],
              'doc2vec__dm': [0,1],
              'doc2vec__vector_size': [i for i in range(400,1000,30)],
              'clf__C': [0.1*i for i in range(8,50,3)]
}

pipe_log = Pipeline([('doc2vec', Doc2VecModel()), ('clf', svm.SVC(gamma=1))])

svm_clf = GridSearchCV(pipe_log, 
                        param_grid=param_grid,
                        scoring="accuracy",
                        verbose=3,
                        n_jobs=-1,cv=5)
svm_clf.fit(x_train,y_train)
svm_clf.best_params_

In [84]:
# Final model: Doc2Vec features fed into an RBF-kernel SVM, trained on the training split
doc2vec_step = Doc2VecModel(min_count=7, vector_size=1000, workers=4, dm=1)
svm_step = svm.SVC(C=2, gamma=0.7, kernel='rbf')
pipe_log = Pipeline([('doc2vec', doc2vec_step), ('clf', svm_step)])
pipe_log.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('doc2vec', <__main__.Doc2VecModel object at 0x000002954B53CCF8>), ('clf', SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.7, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [85]:
# True labels as an array, and the pipeline's predictions for the held-out split
label_test = np.array(y_test)
predicted = pipe_log.predict(x_test)

In [None]:
# Confusion matrix of true vs. predicted classes on the held-out split
cnf_matrix = confusion_matrix(label_test, predicted)
fig = plt.figure(figsize=(18,8))
sns.heatmap(cnf_matrix, cmap="coolwarm_r", annot=True, linewidths=0.5)
plt.title("Confusion_matrix")
plt.xlabel("Predicted_class")
plt.ylabel("Real class")
plt.show()

print("\n----------F1_score------------------------------------")
# The targets are multiclass, so an averaging mode must be given: the default
# average='binary' raises ValueError. 'weighted' averages per-class F1 by support.
print(f1_score(label_test, predicted, average='weighted'))

In [83]:
#-----------------------------------------------submission file---------------------------------------------------------------#

# Score the unlabeled file with the fitted pipeline and map the integer codes back to category names.
test = pd.read_json("datasetresult.json")
test_data = test['headline'] + ' ' + test['short_description']
test_data = prepocess(test_data)
test_data = np.array(test_data)
predicted_data = pipe_log.predict(test_data)

catg_list = list(dic.keys())
val_list = list(dic.values())
# Inverse of `dic` (integer code -> category name): one dict lookup per prediction
# instead of a linear list.index() scan.
code_to_category = dict(zip(val_list, catg_list))
prediction = [code_to_category[code] for code in predicted_data]
submission_data = pd.DataFrame({'id': test['id'], 'category': prediction})

In [None]:
# Write the predictions as a JSON array with one {id, category} record per row
submission_data.to_json("submission.json", orient='records')