In [112]:
#import all the necessary libraries required
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [113]:
#reading training and testing datasets using pandas
df = pd.read_csv("/content/drive/MyDrive/AG Sentence Classification Task/train.csv" , header = None) 
df_test = pd.read_csv("/content/drive/MyDrive/AG Sentence Classification Task/test.csv" , header = None)

In [116]:
#mapping labels to their respective classes
labels_to_class_mapping = {1 : "World" , 2 : "Sports" ,3 : "Business" , 4 : "Sci/Tech"}

In [122]:

#function to map labels
def labels_mapping(l):
  classes = []
  for i in range(0 , len(l)):
    classes.append(labels_to_class_mapping[l[i]])
  return classes
#initializing tokenizer
tokenizer=RegexpTokenizer(r'\w+')

#Getting stop words
en_stopwords=set(stopwords.words('english'))

#initializing stemming Object
ps=PorterStemmer()

#function to clean data using stemming algorithm
def getCleanarticle(article):
    #Tokenize
    tokens=tokenizer.tokenize(article)
    new_tokens=[token for token in tokens if token not in  en_stopwords]
    stemmed_tokens=[ps.stem(token) for token in new_tokens]
    clean_article=' '.join(stemmed_tokens)
    return clean_article

In [123]:
from nltk.stem import WordNetLemmatizer 
#function to clean data using lemmatization algorithm
lemmatizer = WordNetLemmatizer() 
def cleanreview_lemmatize(article):
   tokens=tokenizer.tokenize(article)
   new_tokens=[token for token in tokens if token not in  en_stopwords]
   lemmatize_tokens=[lemmatizer.lemmatize(token) for token in new_tokens]
   clean_article=' '.join(lemmatize_tokens)
   return clean_article

In [125]:
# df[2].apply(getCleanarticle)
# df_test[2].apply(getCleanarticle)

nltk.download('wordnet')
#cleaning training and testing datasets
df[2].apply(cleanreview_lemmatize)
df_test[2].apply(cleanreview_lemmatize)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


0       Unions representing worker Turner Newall say d...
1       SPACE com TORONTO Canada A second team rockete...
2       AP A company founded chemistry researcher Univ...
3       AP It barely dawn Mike Fitzpatrick start shift...
4       AP Southern California smog fighting agency we...
                              ...                        
7595    Ukrainian presidential candidate Viktor Yushch...
7596    With supply attractive pitching option dwindli...
7597    Like Roger Clemens almost exactly eight year e...
7598    SINGAPORE Doctors United States warned painkil...
7599    EBay plan buy apartment home rental service Re...
Name: 2, Length: 7600, dtype: object

In [126]:
#splitting training and testing dataset
X_train =df.loc[:, 2].values
y_train =df.loc[:, 0].values
X_test =df_test.loc[:, 2].values
y_test =df_test.loc[:, 0].values

In [127]:
y_train = labels_mapping(y_train)
y_test = labels_mapping(y_test)

In [130]:
labels = ["World", "Sports" ,"Business" ,"Sci/Tech"]

In [131]:
#initializing vectorization 
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8',
 decode_error='ignore')
#Creating NLP pipeline model using Multinomial Naive Bayes Classification Algorithm
nb = Pipeline([('vect', vectorizer),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

# %%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=labels))
print(y_pred)

accuracy 0.8960526315789473
              precision    recall  f1-score   support

       World       0.86      0.85      0.86      1900
      Sports       0.86      0.88      0.87      1900
    Business       0.95      0.97      0.96      1900
    Sci/Tech       0.91      0.88      0.90      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600

['Business' 'Sci/Tech' 'Sci/Tech' ... 'Sports' 'Business' 'Business']


In [132]:
from sklearn.linear_model import SGDClassifier
#Creating NLP pipeline model using Linear Support Machine Algorithm
sgd = Pipeline([('vect', vectorizer),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

# %%time

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=labels))

accuracy 0.8793421052631579
              precision    recall  f1-score   support

       World       0.86      0.82      0.84      1900
      Sports       0.87      0.83      0.85      1900
    Business       0.89      0.98      0.93      1900
    Sci/Tech       0.90      0.88      0.89      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600



In [133]:
from sklearn.linear_model import LogisticRegression
#Creating NLP pipeline model using Logistic Regression Algorithm
logreg = Pipeline([('vect', vectorizer),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5 , max_iter=600 , solver="saga")),
               ])
logreg.fit(X_train, y_train)

# %%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=labels))



accuracy 0.8664473684210526
              precision    recall  f1-score   support

       World       0.82      0.81      0.81      1900
      Sports       0.82      0.85      0.84      1900
    Business       0.94      0.94      0.94      1900
    Sci/Tech       0.88      0.86      0.87      1900

    accuracy                           0.87      7600
   macro avg       0.87      0.87      0.87      7600
weighted avg       0.87      0.87      0.87      7600

