In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,StratifiedKFold

from sklearn.svm import LinearSVC,SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report,confusion_matrix
from imblearn.over_sampling import RandomOverSampler,SMOTE
import numpy as np

# Number of stratified cross-validation folds; used as `n_splits` for the
# StratifiedKFold splitter and as the divisor when averaging per-fold metrics.
n_folds = 10
def imb_prepare(X, y, random_state=4991):
    """Balance a training set by randomly over-sampling it.

    Parameters
    ----------
    X : array-like
        Feature matrix of the training samples.
    y : array-like
        Class labels aligned with the rows of ``X``.
    random_state : int or None, default 4991
        Seed handed to ``RandomOverSampler`` so the duplicated rows (and thus
        the downstream metrics) are reproducible between runs. Pass ``None``
        to get the previous unseeded behaviour.

    Returns
    -------
    (X_over, y_over)
        The resampled features and labels returned by ``fit_resample``.
    """
    # Random over-sampling is used here; SMOTE variants (one configured for
    # SVMs, one for other classifiers) were tried earlier and are disabled.
    oversample = RandomOverSampler(random_state=random_state)
    # Fit the sampler and apply it in one step.
    X_over, y_over = oversample.fit_resample(X, y)
    return X_over, y_over



# Load the annotated corpus and pull out the text column (inputs) and the
# label column (targets) as NumPy arrays so they can be indexed by fold.
total_data_eng = pd.read_csv("Total_data_annotated_eng_clean_28thApril.csv")
X = total_data_eng['translated'].to_numpy()
y = total_data_eng['label'].to_numpy()

# Single hold-out split, kept for reference only (currently disabled):
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4991)
#print("Training set size : ", len(X_train), " ; Test set size : ", len(X_test))

# Candidate values for the TF-IDF vectorizer's `max_features` setting.
max_feat_list = [
    500,
    1000,
    2000,
    5000,
    10000,
    25000,
]

# For every candidate vocabulary size, run a stratified k-fold cross-validation:
# fit TF-IDF on the training fold only, over-sample the training fold, train the
# classifier, and report accuracy / precision / recall / F1 averaged over folds.

# Pin the confusion-matrix layout to the classes present in the whole dataset so
# its shape does not depend on which classes happen to show up in a given fold.
class_labels = np.unique(y)

for max_feat_elem in max_feat_list:
    ngram_vectorizer = TfidfVectorizer(min_df=0.0025, max_df=0.25, ngram_range=(1, 3),
                                       max_features=max_feat_elem, stop_words='english')
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)

    # Per-fold metric values for this vocabulary size.
    acc_whl = []
    precision_whl = []
    recall_whl = []
    f1_whl = []
    print("-------------------------------------------------------------------")
    print("For max_features value : ", max_feat_elem)
    print("-------------------------------------------------------------------")

    for train, test in kf.split(X, y):
        # Classifier under evaluation. Alternatives that can be swapped in:
        #clf = LogisticRegression()
        #clf = LinearSVC(random_state=4991)
        #clf = RandomForestClassifier(random_state=4991)
        #clf = AdaBoostClassifier(random_state=4991)
        #clf = GradientBoostingClassifier(random_state=4991)
        #clf = GaussianNB()
        clf = SVC(kernel="rbf", random_state=4991)

        trainX, trainY = X[train], y[train]
        testX, testY = X[test], y[test]

        # Fit the vocabulary on the training fold only, then densify.
        # `.toarray()` replaces the deprecated `.A` shorthand.
        tfidf_train = ngram_vectorizer.fit_transform(trainX).toarray()
        tfidf_test = ngram_vectorizer.transform(testX).toarray()
        #print("Training data shape : ", tfidf_train.shape, " ; Test data shape : ", tfidf_test.shape)

        # Only the training fold is over-sampled; the test fold keeps its
        # original class distribution.
        tfidf_train, y_train = imb_prepare(tfidf_train, trainY)
        clf.fit(tfidf_train, y_train)
        y_pred = clf.predict(tfidf_test)
        #print("Predicted : ", y_pred)
        #print("True labels : ", testY)

        #print(classification_report(testY, y_pred))

        # The four-way unpack requires exactly two classes in `class_labels`.
        tn, fp, fn, tp = confusion_matrix(testY, y_pred, labels=class_labels).ravel()
        accuracy = (tp + tn) / (tn + fp + fn + tp)
        # Guard the ratios: a fold with no positive predictions (or no positive
        # samples) would otherwise divide by zero and put NaN into the averages.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if (precision + recall) > 0:
            f1_score = (2.0 * precision * recall) / (precision + recall)
        else:
            f1_score = 0.0
        acc_whl.append(accuracy)
        precision_whl.append(precision)
        recall_whl.append(recall)
        f1_whl.append(f1_score)

    # Average over the folds that were actually evaluated.
    print("Overall accuracy : ", sum(acc_whl) / len(acc_whl))
    print("Overall precision : ", sum(precision_whl) / len(precision_whl))
    print("Overall recall : ", sum(recall_whl) / len(recall_whl))
    print("Overall f1-score : ", sum(f1_whl) / len(f1_whl))

In [17]:
# Read the annotated CSV into `total_data_eng`; this is the same file path that
# the cross-validation cell above already loads.
total_data_eng = pd.read_csv("Total_data_annotated_eng_clean_28thApril.csv")


In [19]:
# Preview the first rows only instead of rendering the entire DataFrame.
total_data_eng.head(10)

Unnamed: 0,id,label,translated,eng_len
0,207102,0,* Jammu Kashmir: Encounter continues in Bu...,1143
1,135627,0,* It is a sign of maturity and deep understand...,3886
2,665173,0,The Indian Army carried out this operation in ...,1523
3,543553,0,A new drama has become popular in the name of ...,3864
4,231428,0,* Know what we all do not have to do at such...,4159
5,609443,0,.60000 people will stand in saf from the Kaaba...,1299
6,576275,0,Ghulam Nabi Azad did not just say that Ajit Do...,2127
7,499523,0,3 Why did Muhammad Marry Several W...,2430
8,664274,0,Shaktahin Mughal was the nominal ruler of Delh...,5296
9,715422,0,Doubt is getting deeper! In most of the cities...,3218
