# Text Classification

In [2]:
import pandas
from sklearn.datasets import fetch_20newsgroups
# Fetch the predefined 'train' and 'test' subsets of the 20 newsgroups corpus.
training_dataset = fetch_20newsgroups(subset='train')
test_dataset = fetch_20newsgroups(subset='test')

# Documents and labels of each split, pulled out of the returned containers.
training_data, training_target = training_dataset['data'], training_dataset['target']
test_data, test_target = test_dataset['data'], test_dataset['target']

# Newsgroup names, indexed by label id (taken from the training split).
target_names = training_dataset['target_names']

In [3]:
# Wrap each split in a two-column DataFrame: document text under 'data',
# label id under 'target' (column order is preserved by the tuple below).
dataset_training = pandas.DataFrame(
    {column: list(training_dataset[column]) for column in ('data', 'target')}
)

dataset_testing = pandas.DataFrame(
    {column: list(test_dataset[column]) for column in ('data', 'target')}
)

In [4]:
# Display the training DataFrame (document text in 'data', label id in 'target').
dataset_training

Unnamed: 0,data,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


In [5]:
# Display the test DataFrame (document text in 'data', label id in 'target').
dataset_testing

Unnamed: 0,data,target
0,From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. ...,7
1,From: Rick Miller <rick@ee.uwm.edu>\nSubject: ...,5
2,From: mathew <mathew@mantis.co.uk>\nSubject: R...,0
3,From: bakken@cs.arizona.edu (Dave Bakken)\nSub...,17
4,From: livesey@solntze.wpd.sgi.com (Jon Livesey...,19
...,...,...
7527,From: richmond@spiff.Princeton.EDU (Stupendous...,14
7528,From: smytonj@murr11.alleg.edu (Jim Smyton)\nS...,4
7529,From: hhenderson@vax.clarku.edu\nSubject: RE: ...,9
7530,From: b859zam@utarlg.uta.edu \nSubject: INTEL ...,6


In [6]:
# Show the class names of the dataset; the position in the list is the label id.
target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
# Build a small two-class example dataset: keep only the IBM PC hardware and the
# Mac hardware newsgroups. The label ids are looked up by name instead of being
# hard-coded; with the target_names list of this notebook they resolve to 3 and 4.
# Note: this rebinds dataset_training / dataset_testing to the filtered frames.
selected_targets = [
    target_names.index('comp.sys.ibm.pc.hardware'),
    target_names.index('comp.sys.mac.hardware'),
]
dataset_training = dataset_training[dataset_training['target'].isin(selected_targets)]
dataset_testing = dataset_testing[dataset_testing['target'].isin(selected_targets)]

In [8]:
# Quick EDA on the filtered dataset: total number of training documents.
len(dataset_training)

1168

In [9]:
# Training documents with label 3 (comp.sys.ibm.pc.hardware): count the True
# entries of the boolean mask; int() keeps the result a plain Python int.
int(dataset_training['target'].eq(3).sum())

590

In [10]:
# Training documents with label 4 (comp.sys.mac.hardware): count the True
# entries of the boolean mask; int() keeps the result a plain Python int.
int(dataset_training['target'].eq(4).sum())

578

In [11]:
# Total number of test documents after the category filter.
len(dataset_testing)

777

In [12]:
# Test documents with label 3 (comp.sys.ibm.pc.hardware): count the True
# entries of the boolean mask; int() keeps the result a plain Python int.
int(dataset_testing['target'].eq(3).sum())

392

In [13]:
# Test documents with label 4 (comp.sys.mac.hardware): count the True
# entries of the boolean mask; int() keeps the result a plain Python int.
int(dataset_testing['target'].eq(4).sum())

385

In [14]:
import string
import spacy
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# English stopword list from NLTK; data_cleaner uses it to filter words.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
english_stopwords = stopwords.words('english')
# spaCy pipeline; data_cleaner uses it to obtain the lemma of every token.
# NOTE(review): presumably requires the 'en_core_web_sm' model to be installed — confirm.
nlp = spacy.load('en_core_web_sm')
# Set of ASCII punctuation characters. Not referenced by the other cells visible
# here (data_cleaner reads string.punctuation directly).
punctuation = set(string.punctuation)

def data_cleaner(dataset, nlp_model=None, stop_words=None):
    """Normalize raw documents before they are vectorized.

    Every document goes through these steps, in this order:
      1. lowercase the text;
      2. replace each ASCII punctuation character with a space;
      3. replace each token by its lemma (via ``nlp_model``);
      4. drop the words found in ``stop_words``;
      5. delete every digit character.

    Because digits are deleted last, a word made only of digits leaves an
    empty slot behind, i.e. two consecutive spaces in the output.

    Args:
        dataset: iterable of strings (a list, a pandas Series, ...).
        nlp_model: callable that maps a string to an iterable of tokens
            exposing a ``lemma_`` attribute. Defaults to the notebook-level
            ``nlp`` pipeline.
        stop_words: collection of words to discard. Defaults to the
            notebook-level ``english_stopwords`` list.

    Returns:
        list[str]: one cleaned string per input document, in input order.
    """
    if nlp_model is None:
        nlp_model = nlp
    if stop_words is None:
        stop_words = english_stopwords
    # A set gives constant-time membership tests instead of scanning a list
    # once per word.
    stop_words = frozenset(stop_words)
    # Single-pass replacement of every punctuation character by a space.
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

    cleaned_documents = []
    for sentence in dataset:
        sentence = sentence.lower().translate(punctuation_to_space)
        lemmatized = ' '.join(token.lemma_ for token in nlp_model(sentence))
        # Join-then-split on purpose: a lemma may itself be whitespace.
        kept_words = ' '.join(
            word for word in lemmatized.split() if word not in stop_words
        )
        # Raw string: '\d' inside a plain literal is an invalid escape sequence.
        cleaned_documents.append(re.sub(r'\d', '', kept_words))

    return cleaned_documents

def bow_tfidf(dataset, tfidf_vectorizer=None):
    """Convert documents into a dense TF-IDF matrix.

    Args:
        dataset: iterable of strings to vectorize.
        tfidf_vectorizer: vectorizer to reuse. When ``None`` a new
            ``TfidfVectorizer`` is created and fitted on ``dataset``;
            otherwise the given object is only used to transform ``dataset``
            (it is not refitted).

    Returns:
        tuple: ``(matrix, vectorizer)`` where ``matrix`` is the result of
        calling ``.toarray()`` on the vectorizer output and ``vectorizer`` is
        the object that produced it (the new one, or the one passed in).
    """
    # Identity check: `== None` would go through the object's own __eq__.
    if tfidf_vectorizer is None:
        tfidf_vectorizer = TfidfVectorizer()
        features = tfidf_vectorizer.fit_transform(dataset)
    else:
        features = tfidf_vectorizer.transform(dataset)

    # Kept dense on purpose: callers in this notebook rely on the array form.
    return features.toarray(), tfidf_vectorizer

In [15]:
# Clean the training texts and fit a new TF-IDF vectorizer on them (None asks
# bow_tfidf to create one). Despite its name, training_data_cleaned holds the
# dense TF-IDF matrix, not the cleaned text. The test set is handled in a later cell.
training_data_cleaned, tfidf_vectorizer = bow_tfidf(data_cleaner(dataset_training['data']), None)

In [16]:
# Peek at the dense TF-IDF matrix of the training set.
training_data_cleaned

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
# Clean the test texts and transform them with the vectorizer fitted on the
# training set; passing a vectorizer makes bow_tfidf call transform only (no refit).
testing_data_cleaned, tfidf_vectorizer = bow_tfidf(data_cleaner(dataset_testing['data']), tfidf_vectorizer)

In [18]:
# Import the multi-layer perceptron classifier from scikit-learn.
# TODO: add cross-validation.

from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(activation='logistic',  # author's note: the logistic function is recommended for text classification
                    hidden_layer_sizes=(100,),
                    max_iter=100,
                    solver='adam',  # author's note: the adam optimizer is recommended for this kind of task
                    tol=0.005,
                    random_state=42,  # fixed seed so that training is repeatable from one run to the next
                    verbose=True)

# Train on the TF-IDF matrix of the training set against its label ids.
clf.fit(training_data_cleaned, dataset_training['target'])

Iteration 1, loss = 0.69783602
Iteration 2, loss = 0.68076142
Iteration 3, loss = 0.67090322
Iteration 4, loss = 0.65969548
Iteration 5, loss = 0.64870477
Iteration 6, loss = 0.63859922
Iteration 7, loss = 0.62705512
Iteration 8, loss = 0.61524543
Iteration 9, loss = 0.60306762
Iteration 10, loss = 0.58985724
Iteration 11, loss = 0.57568794
Iteration 12, loss = 0.56094439
Iteration 13, loss = 0.54514709
Iteration 14, loss = 0.52843899
Iteration 15, loss = 0.51094626
Iteration 16, loss = 0.49279737
Iteration 17, loss = 0.47402542
Iteration 18, loss = 0.45485247
Iteration 19, loss = 0.43499678
Iteration 20, loss = 0.41536214
Iteration 21, loss = 0.39557790
Iteration 22, loss = 0.37557295
Iteration 23, loss = 0.35635207
Iteration 24, loss = 0.33720097
Iteration 25, loss = 0.31865075
Iteration 26, loss = 0.30085219
Iteration 27, loss = 0.28372447
Iteration 28, loss = 0.26750152
Iteration 29, loss = 0.25212862
Iteration 30, loss = 0.23745297
Iteration 31, loss = 0.22366900
Iteration 32, los

In [19]:
# Evaluate the trained model on the test set.
clf.score(testing_data_cleaned, dataset_testing['target'])

0.8828828828828829

In [20]:
# Classify one hand-written sentence: clean it, vectorize it with the fitted
# vectorizer (element 0 of bow_tfidf's result is the matrix), then take the
# single predicted label.
sample_texts = data_cleaner(["IBM is one of the bigger company in the world!"])
sample_features = bow_tfidf(sample_texts, tfidf_vectorizer)[0]
target = clf.predict(sample_features)[0]

In [21]:
# Predicted label id for the sample sentence.
target

3

In [22]:
# Map the predicted label id back to its newsgroup name.
target_names[target]

'comp.sys.ibm.pc.hardware'