# Setup

In [4]:
import numpy as np;
import pandas as pd;
import csv
import re
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.svm import SVC
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import 	WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
import pickle


In [5]:
file = "bbc-news-data.csv"
data = pd.read_csv(file,sep="\t")

def tokenize_and_lemmatize(text):
    data.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    words = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
    lemmatizer = WordNetLemmatizer()
 
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words).lower()

data['new content'] = data['content'].apply(tokenize_and_lemmatize)

data

Unnamed: 0,category,filename,title,content,new content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,quarterly profit at us medium giant timewarner...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,the dollar ha hit it highest level against the...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,the owner of embattled russian oil giant yukos...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,british airways ha blamed high fuel price for ...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,shares in uk drink and food firm allied domecq...
...,...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,bt is introducing two initiative to help beat ...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,computer user across the world continue to ign...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...,a new european directive could put software wr...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...,the man making sure us computer network are sa...


In [None]:
X=data['category']
y=data['title']

X_train,X_test,y_train,y_test = train_test_split(
    X,y, random_state=104,test_size=0.25,shuffle=True
)

print("X_train:")
print(X_train.head())
print(X_train.shape)

print('')
print('X_test : ')
print(X_test.head())
print(X_test.shape)
 
print('')
print('y_train : ')
print(y_train.head())
print(y_train.shape)
 
print('')
print('y_test : ')
print(y_test.head())
print(y_test.shape)

# Naive Bayes

In [None]:
inputs = data.drop(['filename'],axis="columns",inplace=True)

In [None]:
dummies = pd.get_dummies(data['category'])
dummies.head(3)

In [None]:
inputs = pd.concat([inputs,dummies],axis="columns")
inputs.head()

In [None]:
inputs.drop('sport',axis='columns',inplace=True)
inputs.head()

In [4]:
X = data['new content']
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

y_pred = nb_classifier.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

content = "I am going to become world's best basketball player"
content_vectorized = vectorizer.transform([content])
predicted_category = nb_classifier.predict(content_vectorized)
print("Predicted category for '{}' is: {}".format(content, predicted_category[0]))

Accuracy: 0.9775280898876404
Predicted category for 'I am going to become world's best basketball player' is: sport


In [None]:
X = data['new content']
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

predicted_test_category = nb_classifier.predict(X_test_vectorized)

predictions_df = pd.DataFrame({'predicted_category': predicted_test_category}, index=X_test.index)

data_with_predictions = pd.merge(data, predictions_df, left_index=True, right_index=True)

sorted_data = data_with_predictions.sort_values(by='predicted_category')

sorted_data

In [None]:
X = data.drop(columns=['category'], axis=1)
y = data['category']

X_encoded = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3)

rf = MultinomialNB()
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

In [None]:
categorical = [var for var in data.columns if data[var].dtype=='O']

data[categorical].isnull().sum()

# Logistic Regression

In [5]:
X = data['content']
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

lg_classifier = LogisticRegression()
lg_classifier.fit(X_train_vectorized, y_train)

y_pred = lg_classifier.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

def predict_category(text):
    text_vectorized = vectorizer.transform([text])
    predicted_category = lg_classifier.predict(text_vectorized)
    return predicted_category[0]

text_to_predict = "Jovan is a businessman" 
predicted_category = predict_category(text_to_predict)
print("Predicted category for '{}' is: {}".format(text_to_predict, predicted_category))


Accuracy: 0.9820224719101124
Predicted category for 'Jovan is a businessman' is: sport


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
X = data.drop(columns=['category'], axis=1)
y = data['category']

X_encoded = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3)

lg = LogisticRegression()
lg.fit(X_train, y_train)

predictions = lg.predict(X_test)

print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

# Random forest

In [None]:
X = data.drop(columns=['category'], axis=1)
y = data['category']

X_encoded = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

In [None]:
X = data["content"]
y = data["category"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

model = RandomForestClassifier()
model.fit(X_train_features, y_train)

new_text = "This computer is great"

new_text_features = vectorizer.transform([new_text])

predicted_category = model.predict(new_text_features)[0]

print(f"Predicted category for '{new_text}': {predicted_category}")

In [None]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['content'])
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)

data['predicted_category'] = svm_classifier.predict(tfidf_vectorizer.transform(data['content']))

sorted_data = data.sort_values(by='predicted_category')

data

# Testing

In [None]:
from sklearn.datasets import make_blobs

X,Y = make_blobs(n_samples=500,centers =2, random_state=0,cluster_std=0.40)

plt.scatter(X[:,0],X[:,1], c=Y,s=50,cmap='spring')
plt.show()

In [None]:
xfit = np.linspace(-1, 3.5)

plt.scatter(X[:,0],X[:,1],c=Y,s=50, cmap='spring')

for m,b,d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
    yfit = m*xfit+b
    plt.plot(xfit, yfit,'-k')
    plt.fill_between(xfit,yfit -d,yfit + d,edgecolor='none',
    color='#AAAAAA',alpha=0.4)

plt.xlim(-1,3.5)
plt.show()

# K-Nearest Neighbour

In [None]:
X = data["new content"]
y = data["category"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

model = KNeighborsClassifier()
model.fit(X_train_features, y_train)

new_text = "This computer is great"
new_text_features = vectorizer.transform([new_text])
predicted_category = model.predict(new_text_features)[0]

print(f"Predicted category for '{new_text}': {predicted_category}")

In [None]:
X = data.drop(columns=['category'], axis=1)
y = data['category']

X_encoded = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3)

rf = KNeighborsClassifier()
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

# Support Vector Machines

In [None]:
X = data["new content"]
y = data["category"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

model = SVC()
model.fit(X_train_features, y_train)

new_text = "The new phones are all the rage, they are getting better then the computers"
new_text_features = vectorizer.transform([new_text])
predicted_category = model.predict(new_text_features)[0]

print(f"Predicted category for '{new_text}': {predicted_category}")

In [None]:
X = data.drop(columns=['category'], axis=1)
y = data['category']

X_encoded = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3)

rf = SVC()
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

# Picke

In [6]:
X = data["new content"]
y = data["category"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

kneighbour = KNeighborsClassifier()
kneighbour.fit(X_train_features, y_train)

file = 'kneighbour.pkl'
pickle.dump(kneighbour,open(file,'wb'))
vectorized_file = 'knvectoriser.pkl'
pickle.dump(vectorizer,open(vectorized_file,'wb'))

In [None]:
filename = 'kneighbour.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

new_text = "This computer is great"
new_text_features = vectorizer.transform([new_text])
predicted_category = loaded_model.predict(new_text_features)[0]

print(f"Predicted category for '{new_text}': {predicted_category}")

Predicted category for 'This computer is great': tech


In [7]:
X = data["new content"]
y = data["category"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

model = SVC()
model.fit(X_train_features, y_train)

filename='finalised_model.pkl'
pickle.dump(model,open(filename,'wb'))
vectorized_file='svcvectorised.pkl'
pickle.dump(vectorizer,open(vectorized_file,'wb'))


In [None]:
filename = 'finalised_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

new_text = "A world famous singer is going to do another concert"

new_text_features = vectorizer.transform([new_text])
predicted_category = loaded_model.predict(new_text_features)[0]

print(f"Predicted category for {new_text}:", predicted_category)

Predicted category for A world famous singer is going to do another concert: entertainment


In [8]:
X = data['new content']
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

filename = 'naivebayes.pkl'
filename_vectorizer = 'vectorizer.pkl'
pickle.dump(model,open(filename,'wb'))
pickle.dump(vectorizer,open(filename_vectorizer,'wb'))


In [None]:
filename = 'naivebayes.pkl'
loaded_model = pickle.load(open(filename,'rb'))

y_pred = model.predict(X_test_vectorized)

content = "The new phones are all the rage, they are getting better then the computers"
content_vectorized = vectorizer.transform([content])
predicted_category = model.predict(content_vectorized)
print("Predicted category for '{}' is: {}".format(content, predicted_category[0]))

Predicted category for 'The new phones are all the rage, they are getting better then the computers' is: tech


In [9]:
X = data["content"]
y = data["category"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

model = RandomForestClassifier()
model.fit(X_train_features, y_train)

file = 'randomforest.pkl'
pickle.dump(model,open(file,'wb'))
vectorized_file = 'rfvectorizer.pkl'
pickle.dump(vectorizer,open(vectorized_file,'wb'))

In [None]:
filename = 'randomforest.pkl'
loaded_model = pickle.load(open(filename,'rb'))

new_text = "The dollar is doing perfectly"
new_text_features = vectorizer.transform([new_text])
predicted_category = loaded_model.predict(new_text_features)[0]

print(f"Predicted category for '{new_text}': {predicted_category}")

Predicted category for 'The dollar is doing perfectly': sport


In [10]:
X = data['content']
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

y_pred = model.predict(X_test_vectorized)

file = 'logisticregresion.pkl'
pickle.dump(model,open(file,'wb'))
vectorized_file = 'vectorised.pkl'
pickle.dump(vectorizer,open(vectorized_file,'wb'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
filename = 'logisticregresion.pkl'
loaded_model = pickle.load(open(filename,'rb'))

def predict_category(text):
    text_vectorized = vectorizer.transform([text])
    predicted_category = loaded_model.predict(text_vectorized)
    return predicted_category[0]

text_to_predict = "Jovan is a businessman" 
predicted_category = predict_category(text_to_predict)
print("Predicted category for '{}' is: {}".format(text_to_predict, predicted_category))

Predicted category for 'Jovan is a businessman' is: sport


## Chosing the model

In [None]:
context = input("Input the text you want to categorise: ")
def question():
    print("which machine learning model do you want to use: ")
    print("1)Logistic Regression")
    print("2)Naive Bayes ")
    print("3)K-Nearest Neighbors")
    print("4)Random Forest")
    print("5)Support Vector Machine")

question()

def sol(**kwargs):
    ml = input("Choose which model you want to use: ")
    if ml not in {'1', '2', '3', '4', '5'}:
        print("Invalid choice!")
        question()
        return sol(**kwargs)
    else:
        if ml == '1':
            filename = 'logisticregresion.pkl'
            filename_vectorizer = 'vectorised.pkl'
        elif ml == '2':
            filename = 'naivebayes.sav'
            filename_vectorizer = 'vectorizer.pkl'
        elif ml == '3':
            filename = 'kneighbour.pkl'
            filename_vectorizer = 'knvectoriser.pkl'
        elif ml == '4':
            filename = 'randomforest.pkl'
            filename_vectorizer = 'rfvectorizer.pkl'
        elif ml == '5':
            filename = 'finalised_model.pkl'
            filename_vectorizer = 'svcvectorised.pkl'

    loaded_model = pickle.load(open(filename, 'rb'))
    loaded_vectorizer = pickle.load(open(filename_vectorizer, 'rb'))

    text_vectorized = loaded_vectorizer.transform([kwargs['context']])
    predicted_category = loaded_model.predict(text_vectorized)[0]
    print(f"Predicted category for '{kwargs['context']}': {predicted_category}")

context = input("Input the text you want to categorise: ")
question()
sol(context=context)
