In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk
# Tokenizer models needed by word_tokenize (used in the case-normalization cell).
nltk.download('punkt')
# English stop-word list read in the stop-word removal cell.
nltk.download('stopwords')
# WordNet data needed by WordNetLemmatizer (used in the lemmatization cell).
nltk.download('wordnet')

In [None]:
#linear algebra,data preprocessing,Csv files
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


#for data cleaning
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string

#for feature extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#evaluation metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

#for classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

#model selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Read the train and test splits of the AG News dataset into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ag-news-classification-dataset/train.csv',
        '/kaggle/input/ag-news-classification-dataset/test.csv',
    )
)

In [None]:
# Preview the first rows of the training split as loaded from CSV.
train_data.head()

In [None]:
# Preview the first rows of the test split as loaded from CSV.
test_data.head()

In [None]:
# Merge the headline and the article body into a single 'text' column.
# fillna('') stops a missing Title or Description from turning the whole
# concatenation into NaN, which the later regex-based cleaning cannot handle.
train_data['text'] = train_data['Title'].fillna('') + ' ' + train_data['Description'].fillna('')
test_data['text'] = test_data['Title'].fillna('') + ' ' + test_data['Description'].fillna('')
train_data.head()


In [None]:
# 'Title' and 'Description' are already merged into 'text', so remove them.
redundant_columns = ['Title', 'Description']
train_data = train_data.drop(redundant_columns, axis=1)
test_data = test_data.drop(redundant_columns, axis=1)
train_data.head()

In [None]:
#mapping category column with category index


# Readable label for each numeric class index.
categories = {
    1: 'World News',
    2: 'Sports News',
    3: 'Business News',
    4: 'Science-Technology News',
}

# Add the readable label as 'category', then discard the numeric column.
train_data = (
    train_data
    .assign(category=train_data['Class Index'].map(categories))
    .drop(columns=['Class Index'])
)
test_data = (
    test_data
    .assign(category=test_data['Class Index'].map(categories))
    .drop(columns=['Class Index'])
)

In [None]:
# Preview the training split after the category-mapping cell.
train_data.head()

**Data Visualization**

In [None]:
plt.style.use('ggplot')
# Number of training articles in each news category.
category = train_data['category'].value_counts()
fig, ax = plt.subplots(figsize=(10, 5))
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when the
# data vectors are given positionally.
sns.barplot(x=category.index, y=category.values, alpha=0.8, ax=ax)
ax.set_title('Frequency of each category of news(train data)')
ax.set_ylabel('Number of Occurrences', fontsize=12)
ax.set_xlabel('Category', fontsize=12)
plt.show()

In [None]:
plt.style.use('ggplot')
# Number of test articles in each news category.
category = test_data['category'].value_counts()
fig, ax = plt.subplots(figsize=(10, 5))
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when the
# data vectors are given positionally.
sns.barplot(x=category.index, y=category.values, alpha=0.8, ax=ax)
ax.set_title('Frequency of each category of news(test data)')
ax.set_ylabel('Number of Occurrences', fontsize=12)
ax.set_xlabel('Category', fontsize=12)
plt.show()

**Data Cleaning**

In [None]:
#removing punctuations


def remove_punc(text):
    """Strip markup, URLs, punctuation and digit-bearing words from ``text``.

    The following are removed, in this order:

    1. anything enclosed in square brackets, e.g. ``[source]``;
    2. URLs that start with ``http://``, ``https://`` or ``www.``;
    3. angle-bracket tags such as ``<b>``;
    4. every character listed in ``string.punctuation``;
    5. newlines, which are replaced by a space so that the words on either
       side of a line break are not fused together;
    6. every word that contains at least one digit.

    Whitespace left behind by removed pieces is not collapsed.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw string literals keep regex escapes (\[, \S, \w, \d) from being read
    # as invalid Python string escapes, which emit warnings on modern Python.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Run the markup/punctuation cleaner over every article in both splits.
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(remove_punc)

In [None]:
#normalizing case

def normalize(text):
    """Lower-case ``text`` and re-join its tokens with single spaces.

    The lower-cased string is split with ``word_tokenize``; the tokens are
    joined back with one space between them and surrounding whitespace is
    stripped from the result.
    """
    words = word_tokenize(text.lower())
    joined = " ".join(words)
    return joined.strip()



# Lower-case and re-tokenize every article in both splits.
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(normalize)

In [None]:
#removing stop words

nltk_stop_words = nltk.corpus.stopwords.words('english')
# Set copy of the list above so each membership test is O(1) instead of a
# linear scan; the list itself is kept under its original name.
_nltk_stop_word_set = frozenset(nltk_stop_words)


def remove_stop(text):
    """Return ``text`` with NLTK's English stop words removed.

    ``text`` is split on whitespace, every word found in the stop-word list
    is dropped, and the remaining words are joined with single spaces.
    Matching is exact and case-sensitive.

    NOTE(review): this runs after punctuation has been stripped from the
    text, so any stop-word entry that contains an apostrophe presumably
    never matches -- confirm whether that is intended.
    """
    return " ".join(
        word for word in text.split() if word not in _nltk_stop_word_set
    )

# Drop stop words from every article in both splits.
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(remove_stop)

In [None]:
#lemmatizing 

lemmatizer = WordNetLemmatizer()


def lemma(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Each word is passed to ``lemmatizer.lemmatize`` with its default
    arguments, and the results are joined with single spaces.
    """
    return " ".join(map(lemmatizer.lemmatize, text.split()))

# Lemmatize every article in both splits, then show the cleaned training text.
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(lemma)
print(train_data['text'])

**Feature Extraction**

In [None]:
# Learn the vocabulary from the training text and build its term-count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data['text'])
# (number of documents, vocabulary size)
X_train_counts.shape

In [None]:
# Fit the TF-IDF weighting on the raw counts, then re-weight those counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

**Running Machine Learning Algorithm**

Multinomial Naive Bayes

In [None]:
# Term counts -> TF-IDF weights -> multinomial Naive Bayes.
mnb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(mnb_steps).fit(train_data['text'], train_data['category'])

predicted_mnb = text_clf.predict(test_data['text'])
# Test accuracy: share of predictions that equal the true category.
np.mean(predicted_mnb == test_data['category'])

In [None]:
# classification_report returns one multi-line string; print it so the table
# is laid out line by line instead of shown as a quoted repr with "\n" escapes.
print(classification_report(test_data.category, predicted_mnb))

In [None]:
# Confusion matrix of the true test categories (first argument) against the
# Naive Bayes predictions (second argument).
confusion_matrix(test_data.category,predicted_mnb)

**SVM Algorithm**

In [None]:
# Term counts -> TF-IDF weights -> linear classifier trained by SGD with hinge loss.
svm_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
)
svm_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', svm_classifier),
]
text_clf_svm = Pipeline(svm_steps).fit(train_data['text'], train_data['category'])

predicted_svm = text_clf_svm.predict(test_data['text'])
# Test accuracy: share of predictions that equal the true category.
np.mean(predicted_svm == test_data['category'])

In [None]:
# classification_report returns one multi-line string; print it so the table
# is laid out line by line instead of shown as a quoted repr with "\n" escapes.
print(classification_report(test_data.category, predicted_svm))

In [None]:
# Confusion matrix of the true test categories (first argument) against the
# SGD hinge-loss classifier's predictions (second argument).
confusion_matrix(test_data.category,predicted_svm)

**Logistic Regression**

In [None]:
# Term counts -> TF-IDF weights -> logistic regression.
logreg_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=10000)),
]
logreg = Pipeline(logreg_steps).fit(train_data['text'], train_data['category'])

predicted_logreg = logreg.predict(test_data['text'])
# Test accuracy: share of predictions that equal the true category.
np.mean(predicted_logreg == test_data['category'])

In [None]:
# classification_report returns one multi-line string; print it so the table
# is laid out line by line instead of shown as a quoted repr with "\n" escapes.
print(classification_report(test_data.category, predicted_logreg))

In [None]:
# Confusion matrix of the true test categories (first argument) against the
# logistic-regression predictions (second argument).
confusion_matrix(test_data.category,predicted_logreg)

**GridSearchCV**

In [None]:
# Hyper-parameter grid for the Naive Bayes pipeline; each key is
# '<pipeline step name>__<parameter name>'.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# Search every combination in the grid using all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1).fit(
    train_data['text'], train_data['category']
)
predicted_gs = gs_clf.predict(test_data['text'])
# Test accuracy: share of predictions that equal the true category.
np.mean(predicted_gs == test_data['category'])

In [None]:
# classification_report returns one multi-line string; print it so the table
# is laid out line by line instead of shown as a quoted repr with "\n" escapes.
print(classification_report(test_data.category, predicted_gs))

In [None]:
# Confusion matrix of the true test categories (first argument) against the
# grid-searched model's predictions (second argument).
confusion_matrix(test_data.category,predicted_gs)