# Importing Libraries and Data Retrieval


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#importing dataset
dataset = pd.read_csv("/kaggle/input/intelligence-sig-NLP-Task/news_train.csv")
test_set = pd.read_csv("/kaggle/input/intelligence-sig-NLP-Task/test.csv")

In [3]:
#printing to show the dataset
dataset.head()

Unnamed: 0,ID,News_title,News_headline,Category
0,1,"Do men enjoy sex more, or women? The Mahabhara...","[Book Extract] From Anushasana Parva, translat...",Arts
1,2,Why you should eat the Demonetisation laddoo,One laddoo equals to one lakh in your Jan Dhan...,humour
2,3,Is the world headed for a new Cold War?,The battle lines have become very clear with R...,politics
3,4,"Demonetisation is all about Modi, either you'r...",How many times should this hypocritical drama ...,business
4,5,Why electoral bonds won't clean up political f...,Union finance minister Arun Jaitley needs to p...,politics


In [4]:
#shape of dataset
dataset.shape

(15576, 4)

In [5]:
# Mapping the categories to their corresponding IDs
category_to_id = {
    'Arts': 0,
    'business': 1,
    'humour': 2,
    'politics': 3,
    'sports': 4,
    'tech': 5
}
dataset['Category'] = dataset['Category'].map(category_to_id)

dataset.head(10)

Unnamed: 0,ID,News_title,News_headline,Category
0,1,"Do men enjoy sex more, or women? The Mahabhara...","[Book Extract] From Anushasana Parva, translat...",0
1,2,Why you should eat the Demonetisation laddoo,One laddoo equals to one lakh in your Jan Dhan...,2
2,3,Is the world headed for a new Cold War?,The battle lines have become very clear with R...,3
3,4,"Demonetisation is all about Modi, either you'r...",How many times should this hypocritical drama ...,1
4,5,Why electoral bonds won't clean up political f...,Union finance minister Arun Jaitley needs to p...,3
5,6,"Booze, beef and gambling in Goa: BJP's Hindutv...",These double standards are not just shocking b...,3
6,7,Beef politics is taking a bite out of Meghalay...,Party leader in northeastern state quits over ...,3
7,8,What Shivraj Singh Chouhan did when Modi was t...,The Madhya Pradesh chief minister has killed i...,3
8,9,No marks for Chennai’s Bala Vidya Mandir,"The school will sink into quicksand, unless it...",3
9,10,Refrain that India's armed forces can do no wr...,"Pathankot, Uri, Nagrota and Baramullah attacks...",3


# Data Preprocessing

> Applying several pre processing techniques such as removing punctuation, removing stopwords, lemmatizing, stemming, etc.



In [6]:
# Combining News_title and News_headline to create less columns
dataset['text'] = dataset['News_title'] + ' ' + dataset['News_headline']
test_set['text'] = test_set['News_title'] + ' ' + test_set['News_headline']

# removing the unnecessary columns
dataset = dataset.drop(['News_title', 'News_headline'], axis = 1)
test_set = test_set.drop(['News_title', 'News_headline'], axis=1)

# Handling missing data
dataset['text'] = dataset['text'].fillna("")
test_set['text'] = test_set['text'].fillna("")

dataset.head()

Unnamed: 0,ID,Category,text
0,1,0,"Do men enjoy sex more, or women? The Mahabhara..."
1,2,2,Why you should eat the Demonetisation laddoo O...
2,3,3,Is the world headed for a new Cold War? The ba...
3,4,1,"Demonetisation is all about Modi, either you'r..."
4,5,3,Why electoral bonds won't clean up political f...


In [7]:
!pip install nltk



In [8]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

In [9]:
# 1.REMOVING PUNCTUATION
import string
string.punctuation

# Function to remove punctuations
def remove_punc(text):
    nonP_text = "".join([char for char in text if char not in string.punctuation])
    return nonP_text

# Handling missing values and non-string entries, then applying remove_punc
dataset["text"] = dataset["text"].apply(lambda x: remove_punc(x))
test_set["text"] = test_set["text"].apply(lambda x: remove_punc(x))

dataset.head()

Unnamed: 0,ID,Category,text
0,1,0,Do men enjoy sex more or women The Mahabharata...
1,2,2,Why you should eat the Demonetisation laddoo O...
2,3,3,Is the world headed for a new Cold War The bat...
3,4,1,Demonetisation is all about Modi either youre ...
4,5,3,Why electoral bonds wont clean up political fu...


In [10]:
# 2. TOKENIZATION
import re

#function to apply tokenization
def tokenize(text):
    tokens = re.split("\W+", text)  # W+ means all capital, small alphabets and integers 0-9
    return tokens

dataset["text"] = dataset["text"].apply(lambda x: tokenize(x))
test_set["text"] = test_set["text"].apply(lambda x: tokenize(x))

dataset.head()

Unnamed: 0,ID,Category,text
0,1,0,"[Do, men, enjoy, sex, more, or, women, The, Ma..."
1,2,2,"[Why, you, should, eat, the, Demonetisation, l..."
2,3,3,"[Is, the, world, headed, for, a, new, Cold, Wa..."
3,4,1,"[Demonetisation, is, all, about, Modi, either,..."
4,5,3,"[Why, electoral, bonds, wont, clean, up, polit..."


In [11]:
# 3. REMOVING STOPWORDS FROM TEXT
import nltk
stopwords = nltk.corpus.stopwords.words("english")

def remove_stopwords(token):
    text = [word for word in token if word not in stopwords]# to remove all stopwords
    return text

dataset["text"] = dataset["text"].apply(lambda x: remove_stopwords(x))
test_set["text"] = test_set["text"].apply(lambda x: remove_stopwords(x))

dataset.head()

Unnamed: 0,ID,Category,text
0,1,0,"[Do, men, enjoy, sex, women, The, Mahabharata,..."
1,2,2,"[Why, eat, Demonetisation, laddoo, One, laddoo..."
2,3,3,"[Is, world, headed, new, Cold, War, The, battl..."
3,4,1,"[Demonetisation, Modi, either, youre, India, H..."
4,5,3,"[Why, electoral, bonds, wont, clean, political..."


In [12]:
# 4. STEMMING
ps = nltk.PorterStemmer()

def stemming(t_text):
    text = [ps.stem(word) for word in t_text]
    return " ".join(text) #This joins the list of words back into a string

dataset["text"] = dataset["text"].apply(lambda x: stemming(x))
test_set["text"] = test_set["text"].apply(lambda x: stemming(x))

dataset.head()

Unnamed: 0,ID,Category,text
0,1,0,Do men enjoy sex women the mahabharata answer ...
1,2,2,whi eat demonetis laddoo one laddoo equal one ...
2,3,3,Is world head new cold war the battl line beco...
3,4,1,demonetis modi either your india how mani time...
4,5,3,whi elector bond wont clean polit fund union f...


# MODEL BUILDING AND EVALUATION

In [13]:
# declaring dependent and independent value
X = dataset["text"]
Y = dataset["Category"]

# Split into train/test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42)

In [14]:
# Vectorize the data using TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()  
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [15]:
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import numpy as np

# Define parameter grid for Randomized Search
param_dist = {
    'C': np.logspace(-2, 2, 5),  # Explore from 0.01 to 100
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Performing Randomized Search with cross-validation
random_search = RandomizedSearchCV(
    SVC(random_state=42), 
    param_distributions=param_dist,
    n_iter=10,  # Trying 10 random combinations
    scoring='f1_weighted',
    cv=3,  # 3-fold cross-validation
    random_state=42,
    verbose=1,
    n_jobs=-1  # Use all available cores for parallelism
)

# Training the data
random_search.fit(X_train, Y_train)

# Get the best parameters
best_params = random_search.best_params_

# Train the final model with the best parameters on the full dataset
svm_classifier = SVC(**best_params, random_state=42)
svm_classifier.fit(X_train, Y_train)

# Make predictions on the validation set
svm_predictions = svm_classifier.predict(X_test)

# Evaluate the model
f1_svm = f1_score(Y_test, svm_predictions, average='weighted')
print(f"F1-Score: {f1_svm:.4f}")

# Prepare the test set for predictions
X_pred = tfidf.transform(test_set['text'])  # Vectorize the test set text

# Making predictions on the test set (used for submission)
test_predictions = svm_classifier.predict(X_pred)

# Create submission file with correct IDs and predicted Categories
submission = pd.DataFrame({
    'ID': test_set['ID'],  # Use 'ID' from test_set
    'Category': test_predictions  # Predicted categories
})

# Save the submission file
submission.to_csv('submission.csv', index=False)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
F1-Score: 0.8535
