In [68]:
 #Importing Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,roc_auc_score, RocCurveDisplay,roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import nltk
import string
import numpy as np
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import time

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [69]:
train_df = pd.read_csv('/workspaces/codespaces-blank/news_classification/data/train.csv')
valid_df = pd.read_csv('/workspaces/codespaces-blank/news_classification/data/valid.csv')
train_df.head(5)

Unnamed: 0,ArticleId,Text,Category
0,1155,chancellor rallies labour voters gordon brown ...,politics
1,1980,india s maruti sees profits jump india s bigge...,business
2,386,ukip s secret weapon by any measure new york...,politics
3,1436,banker loses sexism claim a former executive a...,business
4,304,dallaglio eyeing lions tour place former engla...,sport


In [70]:
train_df.shape

(1043, 3)

In [71]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1043 entries, 0 to 1042
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1043 non-null   int64 
 1   Text       1043 non-null   object
 2   Category   1043 non-null   object
dtypes: int64(1), object(2)
memory usage: 24.6+ KB


In [72]:
train_df.dtypes

ArticleId     int64
Text         object
Category     object
dtype: object

In [73]:
train_df.isnull().sum()

ArticleId    0
Text         0
Category     0
dtype: int64

In [74]:
columns_list = train_df['Category'].unique().tolist()
columns_list

['politics', 'business', 'sport', 'entertainment', 'tech']

In [75]:
counts = train_df['Category'].value_counts()
for col in columns_list:
    col_sum = counts[col]
    print(f'Total no of {col}_is {col_sum}')

Total no of politics_is 196
Total no of business_is 235
Total no of sport_is 250
Total no of entertainment_is 178
Total no of tech_is 184


In [76]:
train_df.iloc[70,1]

'what now for kelly holmes  last april  kelly holmes spoke to the bbc sport website about her loneliness  her fight to stay fit and her decision not to contest both the 800m and 1500m at the olympics.  it just goes to show even the most meticulous and measured athletes cannot predict what fate has in store for them. four months later  holmes stormed to double olympic gold and has since been made a dame  won the bbc sport personality of the year and written a book whilst still finding time to coach aspiring athletes. with so much time spent in the spotlight  holmes has increasingly dropped hints that her ambition on the track has begun to wilt. and when asked about her plans for both the indoor and outdoor seasons ahead  the 34-year-old has repeatedly chosen to tick the  don t know  box. holmes has now pulled out of this weekend s european indoor championships  where she was selected for both the 800m and 1500m  because of a hamstring injury. but should we be surprised if the olympic ch

## CLEANING

In [77]:
def preprocessor(text):
    '''
    explain func
    '''
    text = text.lower() #Lowers  the text
    text = re.sub('<[^>]*>', '', text) # removes HTML Markup
    text = re.sub(r'[^a-zA-Z\s]', '', text) # removes special characters and digits
    stop_words = set(stopwords.words('english'))
    text = [word for word in text.split() if word not in stop_words and word not in string.punctuation] #removes stopwords
    text = ' '.join(word for word in text)
    return text

In [78]:
train_df['Text'] = train_df['Text'].apply(preprocessor)
train_df.loc[0,'Text']



In [79]:
#train_df['Category'] =  train_df['Category'].map({'politics': 0, 'business' :1 , 'sport': 2, 'entertainment' : 3, 'tech': 4})

In [80]:
x_train = train_df['Text']
y_train = train_df['Category']
tf_idf_vectorizer = TfidfVectorizer(max_df= 0.8, min_df= 10)
x_train_transform = tf_idf_vectorizer.fit_transform(x_train)
print(x_train_transform.shape)
tfidf_dataframe = pd.DataFrame(x_train_transform.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())
tfidf_dataframe.head()

(1043, 3085)


Unnamed: 0,ability,able,abroad,absence,absolutely,abuse,academy,accept,accepted,access,...,years,yen,yes,yet,york,young,younger,yukos,zealand,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.023486,0.0,0.0,0.0,0.0,0.07663,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.035921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.047386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105149,0.0


In [81]:
valid_df['Text'] = valid_df['Text'].apply(preprocessor)
x_test = valid_df['Text']
y_test = valid_df['Category']
x_valid_transform = tf_idf_vectorizer.transform(x_test)
x_valid_transform.shape


(223, 3085)

### Modelling

In [82]:
def model_score(test, pred):
    print(f'Accuracy Score : {accuracy_score(test, pred)}')
    print(f'Accuracy Score : {f1 (test, pred)}')



In [83]:
from sklearn.svm import SVC
baseline_model = SVC(random_state= 10)
%time baseline_model.fit(x_train_transform, y_train)


CPU times: user 1.56 s, sys: 1.37 ms, total: 1.56 s
Wall time: 1.58 s


In [84]:
y_pred = baseline_model.predict(x_valid_transform)
y_pred[10:15]

array(['business', 'entertainment', 'business', 'entertainment',
       'entertainment'], dtype=object)

In [85]:
model_score(y_test, y_pred)

Accuracy Score : 0.968609865470852
