In [1]:
 #Importing Libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,roc_auc_score, RocCurveDisplay,roc_curve
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist
import nltk
import string
import numpy as np
import string
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_df = pd.read_csv('/workspaces/codespaces-blank/news_classification/data/train.csv')
valid_df = pd.read_csv('/workspaces/codespaces-blank/news_classification/data/valid.csv')
train_df.head(5)

Unnamed: 0,ArticleId,Text,Category
0,1155,chancellor rallies labour voters gordon brown ...,politics
1,1980,india s maruti sees profits jump india s bigge...,business
2,386,ukip s secret weapon by any measure new york...,politics
3,1436,banker loses sexism claim a former executive a...,business
4,304,dallaglio eyeing lions tour place former engla...,sport


In [3]:
train_df.describe()

Unnamed: 0,ArticleId
count,1043.0
mean,1118.684564
std,644.332014
min,2.0
25%,569.5
50%,1106.0
75%,1684.0
max,2224.0


In [4]:
train_df.shape

(1043, 3)

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1043 entries, 0 to 1042
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1043 non-null   int64 
 1   Text       1043 non-null   object
 2   Category   1043 non-null   object
dtypes: int64(1), object(2)
memory usage: 24.6+ KB


In [6]:
train_df.dtypes

ArticleId     int64
Text         object
Category     object
dtype: object

In [7]:
train_df.isnull().sum()

ArticleId    0
Text         0
Category     0
dtype: int64

In [8]:
columns_list = train_df['Category'].unique().tolist()
columns_list

['politics', 'business', 'sport', 'entertainment', 'tech']

In [9]:
counts = train_df['Category'].value_counts()
for col in columns_list:
    col_sum = counts[col]
    print(f'Total no of {col}_is {col_sum}')

Total no of politics_is 196
Total no of business_is 235
Total no of sport_is 250
Total no of entertainment_is 178
Total no of tech_is 184


In [10]:
print(train_df['Text'][1])

print(train_df['Text'][10])

print(train_df['Text'][10])

print(train_df['Text'][400])


india s maruti sees profits jump india s biggest carmaker maruti has reported a sharp increase in quarterly profit after a booming economy and low interest rates boosted demand.  net profit surged 70% to 2.39bn rupees ($54.98m; £29.32m) in the last three months of 2004 compared with 1.41bn rupees a year earlier. total sales were 30.1bn rupees  up 27% from the same 2004 period. maruti accounts for half of india s domestic car sales  luring consumers with cheap  fuel-efficient vehicles.  demand in india also has been driven by the poor state of public transport and the very low level of car ownership  analysts said.  figures show that only eight people per thousand are car owners. maruti beat market expectations despite an increase in raw materials costs. the company  majority-owned by japan s suzuki  said an increase in steel and other raw material prices was partially offset by cost cutting. sales in the fiscal third quarter  including vans and utility vehicles  rose by 17.8% to 136.06

## CLEANING

In [11]:
train_df['Text'].str.upper()

0       CHANCELLOR RALLIES LABOUR VOTERS GORDON BROWN ...
1       INDIA S MARUTI SEES PROFITS JUMP INDIA S BIGGE...
2       UKIP S SECRET WEAPON  BY ANY MEASURE  NEW YORK...
3       BANKER LOSES SEXISM CLAIM A FORMER EXECUTIVE A...
4       DALLAGLIO EYEING LIONS TOUR PLACE FORMER ENGLA...
                              ...                        
1038    DOZENS HELD OVER ID FRAUD SITE TWENTY-EIGHT PE...
1039    COMMODORE FINDS NEW LEASE OF LIFE THE ONCE-FAM...
1040    FOCKERS KEEPS US BOX OFFICE LEAD FILM COMEDY M...
1041    ASTRAZENECA HIT BY DRUG FAILURE SHARES IN ANGL...
1042    GERMANY CALLS FOR EU REFORM GERMAN CHANCELLOR ...
Name: Text, Length: 1043, dtype: object

In [12]:
def remove_pun_nums(text):
    text.upper()
    text.replace('[{}]'.format(string.punctuation),'')
    text.replace('\d+', '',)
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word not in stop_words]
    return text



In [13]:
train_df['Text'] = train_df['Text'].apply(remove_pun_nums)

In [15]:
train_df['Text'] = train_df['Text'].apply(remove_stopwords)

In [16]:
print(train_df['Text'][1])

print(train_df['Text'][10])

print(train_df['Text'][10])

print(train_df['Text'][400])


['n', ' ', ' ', 'r', 'u', ' ', 'e', 'e', ' ', 'p', 'r', 'f', ' ', 'j', 'u', 'p', ' ', 'n', ' ', ' ', 'b', 'g', 'g', 'e', ' ', 'c', 'r', 'k', 'e', 'r', ' ', 'r', 'u', ' ', 'h', ' ', 'r', 'e', 'p', 'r', 'e', ' ', ' ', 'h', 'r', 'p', ' ', 'n', 'c', 'r', 'e', 'e', ' ', 'n', ' ', 'q', 'u', 'r', 'e', 'r', 'l', ' ', 'p', 'r', 'f', ' ', 'f', 'e', 'r', ' ', ' ', 'b', 'n', 'g', ' ', 'e', 'c', 'n', ' ', 'n', ' ', 'l', 'w', ' ', 'n', 'e', 'r', 'e', ' ', 'r', 'e', ' ', 'b', 'e', ' ', 'e', 'n', '.', ' ', ' ', 'n', 'e', ' ', 'p', 'r', 'f', ' ', 'u', 'r', 'g', 'e', ' ', '7', '0', '%', ' ', ' ', '2', '.', '3', '9', 'b', 'n', ' ', 'r', 'u', 'p', 'e', 'e', ' ', '(', '$', '5', '4', '.', '9', '8', ';', ' ', '£', '2', '9', '.', '3', '2', ')', ' ', 'n', ' ', 'h', 'e', ' ', 'l', ' ', 'h', 'r', 'e', 'e', ' ', 'n', 'h', ' ', 'f', ' ', '2', '0', '0', '4', ' ', 'c', 'p', 'r', 'e', ' ', 'w', 'h', ' ', '1', '.', '4', '1', 'b', 'n', ' ', 'r', 'u', 'p', 'e', 'e', ' ', ' ', 'e', 'r', ' ', 'e', 'r', 'l', 'e', 'r', '.',