In [3]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Colab-only step: `google.colab` is available only inside a Google Colab runtime.
# files.upload() prompts for a file upload; the captured output below shows
# `cleaned_am_nam_articles.csv` being saved under that name, which is the
# relative path the next cell reads.
from google.colab import files
# NOTE(review): `uploaded` is not referenced by any other cell shown in this notebook.
uploaded = files.upload()

Saving cleaned_am_nam_articles.csv to cleaned_am_nam_articles.csv


In [7]:
# Load the uploaded CSV into a DataFrame. The path is relative, so the file must
# be in the current working directory (the upload cell above saves it under this name).
# The cells below read the `article` and `label` columns from it.
data = pd.read_csv('cleaned_am_nam_articles.csv')

In [8]:
# Raw article text (model input). NOTE: the name `X` is rebound to a dense
# feature matrix by the vectorizer cells further down, so after those cells
# run `X` no longer refers to this column.
X = data['article']
# Target labels; the classification report at the end of the notebook shows
# the two classes 'am' and 'nam'.
y = data['label']

In [None]:
# Build `documents`: one cleaned, lower-cased, lemmatized string per article.
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer only needs the WordNet corpus. The previous
# nltk.download('all') fetched the entire NLTK data collection on every run.
# 'omw-1.4' is downloaded as well because some NLTK releases require it when
# loading WordNet.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Name kept as `stemmer` for compatibility with the rest of the notebook,
# although this object lemmatizes rather than stems.
stemmer = WordNetLemmatizer()


def clean_document(raw_text):
    """Normalize one article into a space-separated string of lemmatized tokens.

    Parameters
    ----------
    raw_text : object
        The article; it is passed through ``str()`` first, so non-string
        values (e.g. NaN from an empty CSV field) become their string form.

    Returns
    -------
    str
        Lower-cased text with non-word characters and isolated single letters
        removed, whitespace collapsed, and every token lemmatized with
        ``stemmer``.
    """
    # Remove all the special characters
    text = re.sub(r'\W', ' ', str(raw_text))

    # Remove single characters that are surrounded by whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove a single character at the start of the text. The original pattern
    # was r'\^[a-zA-Z]\s+', which searched for a literal '^' (already stripped
    # by the \W substitution above) and therefore never matched.
    # This also covers the leading "b " left over from a bytes repr such as
    # "b'...'", so the separate r'^b\s+' rule is no longer needed.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)

    # Converting to Lowercase
    text = text.lower()

    # Lemmatization; split() also discards leading/trailing whitespace
    return ' '.join(stemmer.lemmatize(word) for word in text.split())


# Read the raw column from `data` rather than through `X`: later cells rebind
# `X` to the feature matrix, which would make this cell process the wrong
# object if it were re-run. Iterating over the values (instead of X[i] for i
# in range(len(X))) also avoids depending on a default 0..n-1 index.
documents = [clean_document(article) for article in data['article']]

In [47]:

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned articles.
# NOTE: the `X` produced here is recomputed from `documents` by the
# TfidfVectorizer cell further down, so this result is superseded there.
vectorizer = CountVectorizer(
    max_features=1500,                      # cap on vocabulary size
    min_df=5,                               # document-frequency lower bound
    max_df=0.7,                             # document-frequency upper bound (fraction)
    stop_words=stopwords.words('english'),  # NLTK English stop-word list
)
# Dense array of term counts, one row per document.
X = vectorizer.fit_transform(documents).toarray()

In [48]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weights the count matrix produced by the CountVectorizer cell into TF-IDF values.
# NOTE: the name `tfidfconverter` is rebound to a TfidfVectorizer by the next cell.
tfidfconverter = TfidfTransformer()
# NOTE: this reads `X` and overwrites it, so the cell is not idempotent:
# running it a second time would apply the transform to its own output.
X = tfidfconverter.fit_transform(X).toarray()

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features computed directly from the cleaned text. This rebinds both
# `tfidfconverter` and `X`, replacing what the two previous cells produced,
# and this `X` is the matrix the train/test split below receives.
# NOTE(review): the vectorizer is fitted on every document before the
# train/test split, so the vocabulary and IDF weights also see the test articles.
tfidfconverter = TfidfVectorizer(
    max_features=1500,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)
X = tfidfconverter.fit_transform(documents).toarray()

In [76]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG for any later code that draws from it.
np.random.seed(12345)

# Hold out 20% of the rows for evaluation. The split is given its own fixed
# random_state, separate from the global seed above.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [95]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline: 1100 trees, entropy split criterion.
# random_state is now fixed so that re-running this cell reproduces the same
# forest. Without it the result depended on whatever state NumPy's global RNG
# happened to be in, unlike the XGB and GBM cells, which already pin their
# seeds. The value mirrors the notebook's np.random.seed(12345).
RF = RandomForestClassifier(
    n_estimators=1100,
    criterion="entropy",
    random_state=12345,
)
RF.fit(X_train, y_train)
y_pred_rf = RF.predict(X_test)

# Count of test rows whose predicted label differs from the true label.
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred_rf).sum()))
# score() is called on the test split; its result is printed as a percentage.
print(f"Accuracy of RF is {RF.score(X_test, y_test)*100}%")

Number of mislabeled points out of a total 218 points : 17
Accuracy of RF is 92.20183486238533%


In [113]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Gradient-boosted trees (XGBoost) on the TF-IDF features.
#
# The labels in this notebook are the strings 'am' / 'nam'. Current XGBoost
# releases require integer class labels 0..n_classes-1 and raise on anything
# else, so the labels are encoded for fitting and the predictions are decoded
# back to the original strings. This also works on older releases.
# NOTE: because of this, XGB.predict() itself returns encoded integers; use
# xgb_label_encoder.inverse_transform() to get 'am' / 'nam' back.
xgb_label_encoder = LabelEncoder().fit(y_train)

XGB = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=4,           # scikit-learn-style name for the legacy `nthread`
    scale_pos_weight=1,
    reg_alpha=0.005,
    random_state=27,    # scikit-learn-style name for the legacy `seed`
)
XGB.fit(X_train, xgb_label_encoder.transform(y_train))
y_pred_xgb = xgb_label_encoder.inverse_transform(XGB.predict(X_test))

# Count of test rows whose decoded prediction differs from the true label.
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred_xgb).sum()))
# Accuracy is computed from the decoded predictions: XGB.score() would compare
# the model's integer outputs against the string labels in y_test.
print(f"Accuracy of XGB is {(y_test == y_pred_xgb).mean()*100}%")

Number of mislabeled points out of a total 218 points : 20
Accuracy of XGB is 90.82568807339449%


In [102]:
import time

from sklearn.svm import SVC

# Support vector classifier with library-default settings.
svc = SVC()

# Wall-clock time for fitting on the training split.
training_start = time.perf_counter()
svc.fit(X_train, y_train)
training_end = time.perf_counter()
svc_train_time = training_end-training_start

# Wall-clock time for predicting the held-out split.
prediction_start = time.perf_counter()
preds = svc.predict(X_test)
prediction_end = time.perf_counter()
svc_prediction_time = prediction_end-prediction_start

# Accuracy in percent: matching predictions divided by the number of predictions.
acc_svc = (preds == y_test).sum().astype(float) / len(preds)*100

print("Scikit-Learn's Support Vector Machine Classifier's prediction accuracy is: %3.2f" % (acc_svc))
print("Time consumed for training: %4.3f seconds" % (svc_train_time))
print("Time consumed for prediction: %6.5f seconds" % (svc_prediction_time))

Scikit-Learn's Support Vector Machine Classifier's prediction accuracy is: 92.20
Time consumed for training: 1.784 seconds
Time consumed for prediction: 0.37321 seconds


In [119]:
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn gradient boosting: 800 estimators, max_depth 4, fixed random_state.
GBM = GradientBoostingClassifier(
    n_estimators=800,
    random_state=1,
    max_depth=4,
)
GBM.fit(X_train, y_train)
y_pred_gbm = GBM.predict(X_test)

# Count of test rows whose predicted label differs from the true label.
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred_gbm).sum()))
print(f"Accuracy of GBM is {GBM.score(X_test, y_test)*100}%")

Number of mislabeled points out of a total 218 points : 21
Accuracy of GBM is 90.36697247706422%


In [122]:
from sklearn.ensemble import VotingClassifier

# Hard (majority) voting over the four classifiers built above. No `weights`
# argument is passed, so every estimator's vote counts equally.
# NOTE(review): with four voters and two classes a 2-2 tie is possible; per the
# scikit-learn documentation ties are resolved by ascending class-label order.
# Confirm that this is acceptable.
model_ensemble = VotingClassifier(
    estimators=[
        ('rf', RF),
        ('xgb', XGB),
        ('svc', svc),
        ('gbm', GBM),
    ],
    voting='hard',
)

# fit() trains the ensemble on the training split (this retrains every member).
model_ensemble.fit(X_train, y_train)
y_pred_ensemble = model_ensemble.predict(X_test)

print(f"Accuracy of ensemble {model_ensemble.score(X_test, y_test)*100}%")

Accuracy of ensemble 92.20183486238533%


In [124]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 and support for the ensemble's
# predictions on the held-out test split.
print(classification_report(y_true=y_test, y_pred=y_pred_ensemble))

              precision    recall  f1-score   support

          am       0.94      0.90      0.92       113
         nam       0.90      0.94      0.92       105

    accuracy                           0.92       218
   macro avg       0.92      0.92      0.92       218
weighted avg       0.92      0.92      0.92       218

