In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# In this assigment I will be using 3 different datasets.
## 1. Red and white wine physicochemical properties. (no feature extraction from text)
## 2. SMS Spam messages. (text extraction)
## 3. Book summaries and genres. (multiclass and multioutput classification)

# 1. Wine Dataset
Data from: https://archive.ics.uci.edu/ml/datasets/Wine  
  
I did classification on this data set to check if we can tell from just physicochemical properties of wine if it's red or white.
## Data Preparation

In [2]:
# Load red and white wine datasets
rw = pd.read_csv('winequality-red.csv', sep=';')
ww = pd.read_csv('winequality-white.csv', sep=';')

print(f'Red wine samples {len(rw)}')
print(f'White wine samples {len(ww)}')

# Add column 'type' which will be used for our classification
# Red Wine = -1
rw['type'] = -1
# White Wine = 1
ww['type'] = 1

# Combine datasets
wine_df = pd.concat([rw,ww],ignore_index=True)
# Check if we succesfully copmbined datasets
print('Combined all? ',str(len(wine_df.index) == len(ww.index)+len(rw.index)))
wine_df.head()

Red wine samples 1599
White wine samples 4898
Combined all?  True


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,-1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,-1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,-1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,-1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,-1


### Our data was combined correctly, so let's move on.

## Extract and normalize data
### We need to normalize our data (I will just scale all values to range from 0 to 1), so our LogisticRegression model coefficients are meaningful.

In [3]:
features = wine_df.columns[:11]
print('Features:',', '.join(features))
X = wine_df[features].values
y = wine_df['type'].values

from sklearn import preprocessing
# scale each column to range [0,1]
X = preprocessing.minmax_scale(X)


Features: fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol


## Model Training
### For now we will use all features except 'quality' beacuse it is arbitrary score.
### Column 'type' in this case is our classifier: -1 for red wine and 1 for white

In [4]:
model = LogisticRegression(max_iter = 2000)
X_train,X_test, y_train,y_test = train_test_split(X,y, random_state=24, test_size = 0.25)
model.fit(X_train,y_train)

LogisticRegression(max_iter=2000)

In [5]:
feat_df = pd.DataFrame({'name':features,'coef':model.coef_[0]})

y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(model.classes_)
feat_df.sort_values('coef').style.hide_index()

0.9833846153846154
[-1  1]


name,coef
volatile acidity,-8.749663
fixed acidity,-7.138174
sulphates,-6.436643
chlorides,-5.854486
pH,-5.645728
density,-5.047859
alcohol,1.098243
citric acid,1.114819
free sulfur dioxide,1.448624
residual sugar,4.155372


# 2. SMS Spam

## Data Preparation
Data from: https://www.dt.fee.unicamp.br/~tiago/smsspamcollection/  
  
Text file contains 2 columns, first tells us if following message is spam (spam) or not (ham), and second colun is message itself.  
Based on this I will replace 'spam' with -1 and 'ham' with 1.  
Also our data doesn't contain any NaN values, so we don't need to worry about it.  

In [6]:
sms_df = pd.read_csv('SMSSpamCollection.txt',sep='\t',header = None,names = ['Type','Text'])
# spam sms is -1 and legit sms is 1
sms_df['Type'].replace('spam',-1,inplace = True)
sms_df['Type'].replace('ham',1,inplace = True)
sms_df.head()

Unnamed: 0,Type,Text
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,-1,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


To prepare our text data for feature extraction we need to remove all non meaningful characters/words from text.  
Firstly we will remove all punctuation and then using **ntlk** library, we'll remove all *english* stop words.  
I will also save non processed text so we can later display whole messages.  

In [7]:
# remove punctuation
import string
sep = chr(0)
trans = str.maketrans('','',string.punctuation)
sms_df['Text old'] = sms_df['Text']
sms_df['Text'] = sep.join(sms_df['Text']).translate(trans).split(sep)

import nltk
# download stopwords
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# remove all english stop words
sms_df['Text'] = sms_df['Text'].apply(lambda s: ' '.join([w for w in s.split() if w not in stop]))
sms_df.head()


Unnamed: 0,Type,Text,Text old
0,1,Go jurong point crazy Available bugis n great ...,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar Joking wif u oni,Ok lar... Joking wif u oni...
2,-1,Free entry 2 wkly comp win FA Cup final tkts 2...,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say early hor U c already say,U dun say so early hor... U c already then say...
4,1,Nah I dont think goes usf lives around though,"Nah I don't think he goes to usf, he lives aro..."


Spliting data into test and train sets and vectorizing them.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
sms_vect = CountVectorizer()
# sms_X = sms_df['Text'].values
sms_X = sms_df['Text']
sms_y = sms_df['Type'].values

sms_X_train,sms_X_test, sms_y_train,sms_y_test = train_test_split(sms_X,sms_y,random_state = 66, test_size = 0.25)

sms_X_train = sms_vect.fit_transform(sms_X_train)
# keep non vectorized reviews for display later on
sms_X_test_ = sms_X_test
sms_X_test = sms_vect.transform(sms_X_test)


## Train model

In [9]:
sms_model = LogisticRegression(max_iter = 2000)
sms_model.fit(sms_X_train,sms_y_train)


LogisticRegression(max_iter=2000)

Which words suggest that SMS may be a spam, and which suggest otherwise

In [10]:
coef = sms_model.coef_[0]
words = sms_vect.get_feature_names_out()
rank = np.argsort(coef)
print( 'Words hinting that sms is a spam:', ', '.join( np.take(words,rank[:10]) ) )
print( 'Words hinting that sms is not a spam:', ', '.join( np.take(words,rank[-10:]) ) )

Words hinting that sms is a spam: txt, ringtone, text, stop, claim, mobile, call, reply, message, 18
Words hinting that sms is not a spam: da, and, im, amp, later, sorry, ok, sir, ill, ltgt


Predicting

In [11]:
sms_y_pred = sms_model.predict(sms_X_test)

Prediction accuracy

In [12]:
print(accuracy_score(sms_y_test,sms_y_pred))

0.9748743718592965


In [13]:
sms_prob = sms_model.predict_proba(sms_X_test)
print(sms_model.classes_)
spam = sms_prob[:,0]
ham = sms_prob[:,1]


# print(np.take(sms_X_test_,np.argsort(spam)[-5:]))
# np.take(sms_df['Text old'].values,np.take(sms_X_test_,np.argsort(spam)[-5:]).keys().to_numpy())
print('Top 5 SMS with highest probability of being spam')
print(np.take(sms_df['Text old'].values,np.take(sms_X_test_,np.argsort(spam)[-5:]).keys().to_numpy()))

print('Top 5 SMS with highest probability of NOT being spam')
print(np.take(sms_df['Text old'].values,np.take(sms_X_test_,np.argsort(ham)[-5:]).keys().to_numpy()))

# print(np.take(sms_X_test_,np.argsort(ham)[-5:]))


[-1  1]
Top 5 SMS with highest probability of being spam
['You have WON a guaranteed £1000 cash or a £2000 prize. To claim yr prize call our customer service representative on 08714712394 between 10am-7pm'
 'FreeMsg: Fancy a flirt? Reply DATE now & join the UKs fastest growing mobile dating service. Msgs rcvd just 25p to optout txt stop to 83021. Reply DATE now!'
 'This message is free. Welcome to the new & improved Sex & Dogging club! To unsubscribe from this service reply STOP. msgs@150p 18 only'
 'Ur ringtone service has changed! 25 Free credits! Go to club4mobiles.com to choose content now! Stop? txt CLUB STOP to 87070. 150p/wk Club4 PO Box1146 MK45 2WT'
 'You have WON a guaranteed £1000 cash or a £2000 prize. To claim yr prize call our customer service representative on 08714712412 between 10am-7pm Cost 10p']
Top 5 SMS with highest probability of NOT being spam
['Heart is empty without love.. Mind is empty without wisdom.. Eyes r empty without dreams &amp; Life is empty without fr

# 3. Book summaries dataset
This part was mostly made from curiosity if we can make model that will be able to pretty accurately predict book genres (multiple per book) based on just book summary.


## Data Preparation
Dataset contains:
1. Wikipedia article ID
2. Freebase ID
3. Book title
4. Author
5. Publication date
6. Book genres (Freebase ID:name tuples)
7. Plot summary  

For classification we will only need 'book genres' and 'plot summary' and for convenience 'book title'  
Also we should drop every book without specified genre(s)

In [14]:
book_df = pd.read_csv('booksummaries.txt',sep='\t', usecols = [2,5,6], header=None , names=['Title','Genre','Summary'])
# remove rows with NaN values
book_df.dropna(inplace=True)
book_df.head()

# book_df['tmp'] = book_df['Genre'].apply(json.loads)


Unnamed: 0,Title,Genre,Summary
0,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
4,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...
5,All Quiet on the Western Front,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge..."


Values of 'genre' column are JSON formatted strings so we need to convert it to lists of just values (we don't need keys as they're some internal id's of database from which this data was extracted)

In [15]:
import json

# parse json string to python list
book_df['Genres List'] = book_df['Genre'].apply(lambda x: list(json.loads(x).values()))

book_df.head()


Unnamed: 0,Title,Genre,Summary,Genres List
0,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","[Roman à clef, Satire, Children's literature, ..."
1,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","[Science Fiction, Novella, Speculative fiction..."
2,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"[Existentialism, Fiction, Absurdist fiction, N..."
4,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,"[Hard science fiction, Science Fiction, Specul..."
5,All Quiet on the Western Front,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...","[War novel, Roman à clef]"


Here I do same text preparation as in previous dataset.  
Remove punctuation and stop words.

In [16]:
import string
# nltk.download('stopwords') # uncomment when stopwords throws exception/error
from nltk.corpus import stopwords
stop = stopwords.words('english')
sep = chr(0)
trans = str.maketrans('','',string.punctuation)
book_df['Summary'] = sep.join(book_df['Summary']).translate(trans).split(sep)
book_df['Summary'] = book_df['Summary'].apply( lambda s: ' '.join([w for w in s.split() if w not in stop]))
book_df.head()

Unnamed: 0,Title,Genre,Summary,Genres List
0,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...",Old Major old boar Manor Farm calls animals fa...,"[Roman à clef, Satire, Children's literature, ..."
1,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...",Alex teenager living nearfuture England leads ...,"[Science Fiction, Novella, Speculative fiction..."
2,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text The Plague divided five parts In town...,"[Existentialism, Fiction, Absurdist fiction, N..."
4,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits space around Milky Way divide...,"[Hard science fiction, Science Fiction, Specul..."
5,All Quiet on the Western Front,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...",The book tells story Paul Bäumer German soldie...,"[War novel, Roman à clef]"


Beacuse each book has different number of genres we need to convert genres label to matrix.  
We can do this using sklearn MultiLabelBinarizer which will make binary matrix, we can later convert it back to list of genres using *inverse_transform* method.  
I will also limit number of samples to first 4000 beacuse without it training model takes about 5 minutes and with limited samples only a minute.  

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
samples = 4000
# samples = len(book_df.index)
bX = book_df['Summary'].values[:samples]
# convert lists of genres to matrix
binarizer = MultiLabelBinarizer()
by = binarizer.fit_transform(book_df['Genres List'].values[:samples])

bX_train,bX_test , by_train,by_test = train_test_split(bX,by,test_size = 0.25, random_state = 20)


In [18]:
vect = CountVectorizer()
bX_train = vect.fit_transform(bX_train)
bX_test = vect.transform(bX_test)

In [19]:
print(by.shape)

(4000, 180)


Beacuse LogisticRegression supports only binary classification and only **one** value on output we need to use something else.  
Fortunately we can convert most binary classifiers to multiclass classifiers, I will use simplest to use which is One vs Rest Classifier.  
This class from **sklearn** creates *n* different models( where n is number of classes) of specified type(here logisticregression)

In [21]:
from sklearn.multiclass import OneVsRestClassifier
book_model = LogisticRegression(max_iter = 2000)
ovr = OneVsRestClassifier(book_model)
ovr.fit(bX_train,by_train)

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


OneVsRestClassifier(estimator=LogisticRegression(max_iter=2000))

In [22]:
by_pred = ovr.predict(bX_test)
print(binarizer.inverse_transform(by_pred[:5]))
print(binarizer.inverse_transform(by_test[:5]))

[('Speculative fiction',), (), ('Fantasy', 'Fiction', 'Speculative fiction'), ('Mystery',), ()]
[("Children's literature", 'Fiction', 'Science Fiction', 'Speculative fiction'), ("Children's literature", 'Picture book'), ("Children's literature", 'Fantasy', 'High fantasy', 'Speculative fiction'), ('Mystery',), ('Existentialism',)]


Here we see one problem some genres are really similar like: Fantasy and High Fantasy.  
Test book had: ('Fantasy', 'Fiction', 'Speculative fiction') genres  
and model predicted ("Children's literature", 'Fantasy', 'High fantasy', 'Speculative fiction')  


In [23]:
ovr.score(bX_test,by_test)

0.09

As we can see out model perfectly predicted(number AND genres match test data) only a 9% of our test sample.  
That's why this section was mostly made to learn how to even start with such a model.  