In [31]:
import csv
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [None]:
# Location of the tab-separated news dataset, relative to the notebook.
file = "bbc-news-data.csv"

# The file is tab-delimited despite its .csv extension.
data = pd.read_csv(file, delimiter="\t")

def tokenize_and_lemmatize(text):
    """Normalise one document into a lowercase, lemmatized, space-joined string.

    Steps: drop non-ASCII characters, lowercase, split on whitespace, then
    lemmatize every token with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw document text. ``None`` and NaN (missing cells) yield ``''``;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The processed tokens joined by single spaces.

    Notes
    -----
    The lemmatizer needs the NLTK ``wordnet`` corpus to be available
    (``nltk.download('wordnet')``) -- NOTE(review): confirm it is installed
    in the environment that runs this notebook.
    """
    # Missing cells arrive as None / float NaN; treat them as empty documents
    # instead of failing inside the tokenizer.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Strip non-ASCII characters from *this* document. The previous version
    # ran the strip over the whole global DataFrame on every call and threw
    # the result away, which was both very slow and a no-op.
    ascii_text = str(text).encode('ascii', 'ignore').decode('ascii')

    # Lowercase BEFORE lemmatizing: WordNet lemmas are lowercase, so a
    # capitalised token such as "Dogs" would otherwise be left untouched.
    words = nltk.tokenize.WhitespaceTokenizer().tokenize(ascii_text.lower())

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# Run every article body through the preprocessing function and keep the
# result alongside the raw text in a separate column.
data['new content'] = data['content'].map(tokenize_and_lemmatize)

data

In [None]:
# Hold-out split demo. The target is the news category (as in every later
# cell); the headline is the feature. The previous version had the two
# columns swapped, i.e. it used the label as the input and the title as
# the target.
X = data['title']
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=104, test_size=0.25, shuffle=True
)

# Show the first rows and the size of each of the four partitions.
print("X_train:")
print(X_train.head())
print(X_train.shape)

for label, part in (
    ('X_test : ', X_test),
    ('y_train : ', y_train),
    ('y_test : ', y_test),
):
    print('')
    print(label)
    print(part.head())
    print(part.shape)

# Naive Bayes

In [26]:
# DataFrame.drop(..., inplace=True) returns None, so the original assignment
# left `inputs` bound to None. Drop the column without `inplace`, rebind
# `data` (later cells expect 'filename' to be gone from it), and give
# `inputs` its own copy so edits to one frame do not affect the other.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data = data.drop(columns=['filename'], errors='ignore')
inputs = data.copy()

In [None]:
# One indicator column per distinct value of the category column.
dummies = pd.get_dummies(data.loc[:, 'category'])
dummies.iloc[:3]

In [None]:
# Append the indicator columns side by side with the existing inputs.
inputs = pd.concat((inputs, dummies), axis=1)
inputs.head(5)

In [None]:
# Remove the 'sport' column from the inputs frame (modified in place).
inputs.drop(columns=['sport'], inplace=True)
inputs.head(5)

In [None]:
# Features: preprocessed article text; target: news category.
X = data['new content']
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Bag-of-words counts; the vocabulary is learned from the training split only.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

y_pred = nb_classifier.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classify an ad-hoc sentence. It must go through the same preprocessing as
# the training text ('new content' is produced by tokenize_and_lemmatize);
# previously the raw sentence was vectorized directly.
content = "Jovan could run in Worlds if he realy tried"
content_vectorized = vectorizer.transform([tokenize_and_lemmatize(content)])
predicted_category = nb_classifier.predict(content_vectorized)
print("Predicted category for '{}' is: {}".format(content, predicted_category[0]))

In [None]:
# Label every article with the Naive Bayes model. This cell used to repeat
# the split / vectorizer / fit from the previous cell verbatim (same columns,
# same test_size and random_state), producing an identical model; it now
# reuses the already-fitted `vectorizer` and `nb_classifier` instead.
data['predicted_category'] = nb_classifier.predict(
    vectorizer.transform(data['new content'])
)

# Group the rows by predicted label for inspection.
sorted_data = data.sort_values(by='predicted_category')

sorted_data

In [None]:
# Names of all object-dtype columns in the frame.
categorical = [name for name, dtype in data.dtypes.items() if dtype == 'O']

# Count of missing values in each of those columns.
data[categorical].isna().sum()

# Random forest

In [None]:
# Random forest on TF-IDF features of the preprocessed article text.
#
# The previous version one-hot encoded every column except 'category' with
# pd.get_dummies. Two problems:
#   * the text columns are free text, so each distinct document became its
#     own indicator column -- nothing learned on them transfers to unseen rows;
#   * 'predicted_category' (written into `data` by the Naive Bayes cell) was
#     still among the features, leaking another model's predictions of the
#     target into this one.
X = data['new content']
y = data['category']

# Fixed seed so the split (and the reported metrics) are reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the vectorizer on the training split only, then reuse it for the test split.
rf_vectorizer = TfidfVectorizer()
X_train = rf_vectorizer.fit_transform(X_train_text)
X_test = rf_vectorizer.transform(X_test_text)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

In [None]:
# Linear SVM on TF-IDF features of the raw article text.
X = data['content']
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit TF-IDF on the training split only. Previously it was fitted on all
# rows before splitting, so test documents influenced the vocabulary and
# IDF weights.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)

# The held-out predictions were computed but never used; report accuracy
# the same way the Naive Bayes cell does.
y_pred = svm_classifier.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Label every article with the SVM (overwrites any earlier 'predicted_category').
data['predicted_category'] = svm_classifier.predict(tfidf_vectorizer.transform(data['content']))

sorted_data = data.sort_values(by='predicted_category')

# Show the sorted frame that was just built (the unsorted `data` was shown before).
sorted_data

# Testing

In [None]:
# Synthetic 2-D data: 500 points drawn around 2 centres (fixed seed).
# `make_blobs` is imported in the notebook's import cell rather than here.
# NOTE: this rebinds the global `X` used by the modelling cells above.
X, Y = make_blobs(n_samples=500, centers=2, random_state=0, cluster_std=0.40)

# Scatter the points, coloured by the cluster label returned in Y.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')
ax.set(title='Synthetic two-cluster data', xlabel='feature 0', ylabel='feature 1')
plt.show()

In [None]:
# x positions at which each straight line is evaluated.
xfit = np.linspace(-1, 3.5)

fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')

# Each entry is (slope, intercept, half-width of the shaded band).
candidate_lines = [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]
for slope, intercept, margin in candidate_lines:
    yfit = slope * xfit + intercept
    ax.plot(xfit, yfit, '-k')
    # Grey band extending `margin` above and below the line.
    ax.fill_between(
        xfit,
        yfit - margin,
        yfit + margin,
        edgecolor='none',
        color='#AAAAAA',
        alpha=0.4,
    )

ax.set_xlim(-1, 3.5)
plt.show()