In [73]:
import pandas as pd
import numpy as np
import string
import re

In [74]:
df = pd.read_csv("bbc-news-data.csv", sep='\t')
df.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [75]:
df = df.drop(['filename'], axis=1)
df = df.drop(['title'], axis=1)
df

Unnamed: 0,category,content
0,business,Quarterly profits at US media giant TimeWarne...
1,business,The dollar has hit its highest level against ...
2,business,The owners of embattled Russian oil giant Yuk...
3,business,British Airways has blamed high fuel prices f...
4,business,Shares in UK drinks and food firm Allied Dome...
...,...,...
2220,tech,BT is introducing two initiatives to help bea...
2221,tech,Computer users across the world continue to i...
2222,tech,A new European directive could put software w...
2223,tech,The man making sure US computer networks are ...


In [76]:
df.content = df.content.str.lower()

In [77]:
df = df.replace(r'[.,"\'-?:!;/<>()*\\]', '', regex=True)

In [78]:
df.head()

Unnamed: 0,category,content
0,business,quarterly profits at us media giant timewarne...
1,business,the dollar has hit its highest level against ...
2,business,the owners of embattled russian oil giant yuk...
3,business,british airways has blamed high fuel prices f...
4,business,shares in uk drinks and food firm allied dome...


In [79]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [80]:
stop_words = set(stopwords.words('english'))
tokenize = word_tokenize(str(df))
words_filtered = [word for word in tokenize if not word in stop_words]
print(words_filtered)

['category', 'content', '0', 'business', 'quarterly', 'profits', 'us', 'media', 'giant', 'timewarne', '...', '1', 'business', 'dollar', 'hit', 'highest', 'level', '...', '2', 'business', 'owners', 'embattled', 'russian', 'oil', 'giant', 'yuk', '...', '3', 'business', 'british', 'airways', 'blamed', 'high', 'fuel', 'prices', 'f', '...', '4', 'business', 'shares', 'uk', 'drinks', 'food', 'firm', 'allied', 'dome', '...', '...', '...', '...', '2220', 'tech', 'bt', 'introducing', 'two', 'initiatives', 'help', 'bea', '...', '2221', 'tech', 'computer', 'users', 'across', 'world', 'continue', '...', '2222', 'tech', 'new', 'european', 'directive', 'could', 'put', 'software', 'w', '...', '2223', 'tech', 'man', 'making', 'sure', 'us', 'computer', 'networks', '...', '2224', 'tech', 'online', 'role', 'playing', 'games', 'timeconsuming', 'b', '...', '[', '2225', 'rows', 'x', '2', 'columns', ']']


In [81]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemm_df = [lemmatizer.lemmatize(word) for word in words_filtered] 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['content'])
vectorizer.get_feature_names_out()

array(['aa', 'aaa', 'aaas', ..., 'zutons', 'zvonareva', 'zvyagintsev'],
      dtype=object)

In [98]:
print(X.shape)

(2225, 31325)


In [100]:
np.unique(X.toarray())
print(X.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [101]:
y = df['category']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [102]:
from sklearn.ensemble import GradientBoostingClassifier

In [106]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9393258426966292

In [115]:
from sklearn.model_selection import GridSearchCV
parametrs = { 'n_estimators': [1, 5, 10, 50, 100],
              'max_depth': [1, 2, 3, 4, 5] }

In [116]:
grid = GridSearchCV(clf, parametrs, cv=5)
grid.fit(X_train, y_train)

KeyboardInterrupt: ignored

In [None]:
grid.best_params_