In [1]:
# text classification

# let's make predictions!


# End goal:

# model.predict(my_text)  # it'll tell us the classification of that text

In [2]:
# two basic types of machine learning:
# - supervised learning
#     teaching the computer with examples
#     asking it to make predictions based on examples

#     - classification
#     - regression

# - unsupervised learning
#     no labeled examples; the computer looks for structure on its own

In [3]:
# within machine learning
# within supervised learning
# within classification

# text classification

In [4]:
# classify by source
# classify by meaning
# classify by age

# spam filter

In [6]:
!wget http://files.lerner.co.il/text-analysis.zip

--2020-10-14 22:17:18--  http://files.lerner.co.il/text-analysis.zip
Resolving files.lerner.co.il (files.lerner.co.il)... 138.197.26.202
Connecting to files.lerner.co.il (files.lerner.co.il)|138.197.26.202|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3447 (3.4K) [application/zip]
Saving to: ‘text-analysis.zip’


2020-10-14 22:17:19 (8.25 KB/s) - ‘text-analysis.zip’ saved [3447/3447]



In [7]:
!unzip text-analysis.zip

Archive:  text-analysis.zip
  inflating: spotify-support.csv     
  inflating: apple-support.csv       


In [8]:
# peek at the top of the raw Apple CSV: a header row, then one tweet per record
!head apple-support.csv

tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡,119236,
119248,AppleSupport,False,Wed Oct 11 13:38:29 +0000 2017,@105837 We can help. Which version of iOS are you on? You can find that in Settings &gt; General &gt; About. Reply in DM. https://t.co/GDrqU22YpT,,119249
119249,105837,True,Wed Oct 11 07:37:27 +0000 2017,"@105838 @AppleSupport Me too am suffering , hope the can find a solution",119248,119250
119250,105838,True,Wed Oct 11 05:33:17 +0000 2017,"@AppleSupport hi #apple, I’ve a concern about the latest ios is too slow on #iphone6 and i am not happy with it. Any solution please?","119249,119251",
119252,AppleSupport,False,Wed Oct 11 13:40:27 +0000 2017,@105839 Thanks for reaching out to us. We are always happy to help. Send us a DM so we can look into this together. https://t.co/GDr

In [5]:
# (1) get some input text, for which we know the classification
# (2) clean up the text / put it into a format the model could use
# (3) part of that is: defining a vocabulary and creating a term matrix
# (4) create the model
# (5) split our data into training and testing data
# (6) make some predictions
# (7) check -- how good are our predictions?

# part 1: Get the text into data frames

In [9]:
%pylab inline
import pandas as pd
from pandas import Series, DataFrame

Populating the interactive namespace from numpy and matplotlib


In [11]:
# Load each company's support tweets, keeping only the tweet body;
# none of the other CSV columns are used for classification.
apple_df = pd.read_csv(
    'apple-support.csv',
    usecols=['text'],
)
spotify_df = pd.read_csv(
    'spotify-support.csv',
    usecols=['text'],
)

In [13]:
# (rows, columns): how many Apple tweets were loaded, and the single 'text' column
apple_df.shape

(24, 1)

In [14]:
# (rows, columns): how many Spotify tweets were loaded, and the single 'text' column
spotify_df.shape

(16, 1)

In [15]:
# add a column indicating which source the tweets are from
# 0 = Apple, 1 = Spotify -- this is the label the model will learn to predict
apple_df['target'] = 0
spotify_df['target'] = 1

In [16]:
# Stack the Apple and Spotify tweets into a single frame.
# ignore_index=True renumbers the rows 0..n-1. Without it each half keeps its
# own labels (0, 1, 2, ...), the combined index contains duplicates, and a
# label lookup such as df.loc[0] returns two rows instead of one.
df = pd.concat([apple_df, spotify_df], ignore_index=True)
df.head()

Unnamed: 0,text,target
0,@AppleSupport causing the reply to be disregar...,0
1,@105837 We can help. Which version of iOS are ...,0
2,"@105838 @AppleSupport Me too am suffering , ho...",0
3,"@AppleSupport hi #apple, I’ve a concern about ...",0
4,@105839 Thanks for reaching out to us. We are ...,0


In [17]:
# every tweet from both sources, with two columns: text and target
df.shape

(40, 2)

In [18]:
# Part 1 in a single cell: load both files, label them, and combine them.
apple_df = pd.read_csv('apple-support.csv', usecols=['text'])
spotify_df = pd.read_csv('spotify-support.csv', usecols=['text'])

# the label we want to predict: 0 = Apple, 1 = Spotify
apple_df['target'] = 0
spotify_df['target'] = 1

# ignore_index=True renumbers the rows 0..n-1. Without it each half keeps its
# own labels, so the combined index would contain every small number twice.
df = pd.concat([apple_df, spotify_df], ignore_index=True)

In [19]:
# display the combined frame (Apple rows come first, then Spotify)
df

Unnamed: 0,text,target
0,@AppleSupport causing the reply to be disregar...,0
1,@105837 We can help. Which version of iOS are ...,0
2,"@105838 @AppleSupport Me too am suffering , ho...",0
3,"@AppleSupport hi #apple, I’ve a concern about ...",0
4,@105839 Thanks for reaching out to us. We are ...,0
5,@105841 We'd love to help! Please DM us and le...,0
6,@AppleSupport after the 11.0.2 my phone just s...,0
7,"@105843 Battery life is important, and we're h...",0
8,@105844 Thanks for reaching out to us. We are ...,0
9,@AppleSupport Can you get my iPhone 7plus back...,0


In [23]:
# full, untruncated text of the first tweet: pick the column, then the
# first row by position
df['text'].iloc[0]

'@AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡'

In [24]:
# A toy example of what we're about to build:
#   text #1 = have a nice day
#   text #2 = what day is today?
#
# vocabulary: the distinct words found across all of the texts
#
# document term matrix (DTM): one row per text, one column per vocabulary
# word; each cell counts how often that word occurs in that text.
# (The small words "a" and "is" are left out to keep the example short.)
#
# have nice day what today
# 1     1   1   0     0      <- text #1
# 0     0   1   1     1      <- text #2

# we're going to use the "count vectorizer" to create a vocabulary
# and a DTM

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()



In [25]:
# have the vectorizer establish the vocabulary from our texts
# (this step only learns the words; the DTM itself is built by transform())
cv.fit(df['text'])

CountVectorizer()

In [26]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the displayed result a plain list of words.
cv.get_feature_names_out().tolist()

['10',
 '105837',
 '105838',
 '105839',
 '105840',
 '105841',
 '105843',
 '105844',
 '105846',
 '105847',
 '105848',
 '105849',
 '105851',
 '105852',
 '105857',
 '11',
 '12',
 '2014',
 '2016',
 '22',
 '320',
 '65',
 '76099',
 '76495',
 '7plus',
 '857',
 '91226',
 'about',
 'affect',
 'after',
 'again',
 'air',
 'all',
 'also',
 'always',
 'am',
 'amp',
 'an',
 'and',
 'android',
 'anker',
 'any',
 'anything',
 'app',
 'apple',
 'applesupport',
 'apps',
 'are',
 'armv7',
 'assistance',
 'at',
 'away',
 'ay',
 'back',
 'battery',
 'be',
 'been',
 'before',
 'behaviors',
 'bluetooth',
 'brilliant',
 'broken',
 'bug',
 'but',
 'can',
 'carry',
 'cause',
 'causing',
 'check',
 'cheers',
 'close',
 'co',
 'code',
 'come',
 'comes',
 'concern',
 'constantly',
 'could',
 'cp',
 'crash',
 'crossed',
 'currently',
 'days',
 'device',
 'did',
 'difference',
 'disconnects',
 'disgrace',
 'disregarded',
 'distance',
 'dm',
 'does',
 'dropped',
 'early',
 'else',
 'eqisdmwzat',
 'everything',
 'exac

In [27]:
# create a DTM from our texts, using the vocabulary
# we'll call this DTM X, because it'll be our input for model predictions
# (one row per tweet, one column per vocabulary word)

X = cv.transform(df['text'])

In [28]:
# the repr reports the matrix dimensions (tweets x vocabulary words) and
# that it is stored in a sparse format
X

<40x317 sparse matrix of type '<class 'numpy.int64'>'
	with 787 stored elements in Compressed Sparse Row format>

In [29]:
# y is the known answer for every tweet (0 = Apple, 1 = Spotify)
y = df['target']

In [30]:
# we're going to use the Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()   # created empty here; it learns nothing until fit() is called

In [31]:
# classic way to teach a model... give it all X and all y
# (X = the word counts for every tweet, y = the known source of every tweet)
model.fit(X, y)

# now that we have trained our model, let's make some predictions!
# note: these predictions are for the very same tweets the model was trained on
y_pred = model.predict(X)

In [32]:
from sklearn.metrics import accuracy_score
# accuracy_score's signature is (y_true, y_pred): known labels first,
# predictions second.
# Caution: y_pred was made for the same tweets the model was trained on, so
# this score says nothing about how well it handles text it hasn't seen.
accuracy_score(y, y_pred)

1.0

In [33]:
# the predicted label for each tweet, in row order (0 = Apple, 1 = Spotify)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [36]:
# the true labels as a plain array, for a side-by-side comparison with y_pred
y.values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# The whole workflow so far, gathered into one cell.
cv = CountVectorizer()

X = cv.fit_transform(df['text'])  # learn the vocab and build the DTM (X) in one step
y = df['target']                  # the known labels, from df['target']

model = MultinomialNB()       # create a new model
model.fit(X, y)               # train the model with all of our data

y_pred = model.predict(X)     # predict labels for those same tweets

# accuracy_score's signature is (y_true, y_pred): known labels first.
# Caution: the model is being graded on the tweets it was trained on, so this
# score says nothing about how well it handles text it hasn't seen.
accuracy_score(y, y_pred)

1.0

In [43]:
# we'll break up our data into training and testing data
# we'll train (fit) with the training data
# we'll test (predict) with the testing data

# this checks if our model can correctly classify things it hasn't seen before
#
# The split is done on the raw text, *before* the vocabulary is built.
# Fitting the vectorizer on every tweet would let words from the test tweets
# leak into the features, which defeats the point of holding them out.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

y = df['target']              # the known labels, from df['target']

# random_state pins the shuffle, so the cell gives the same answer on every run
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, random_state=0)

cv = CountVectorizer()
X_train = cv.fit_transform(text_train)  # learn the vocab from the training tweets only
X_test = cv.transform(text_test)        # map the held-out tweets onto that vocab
X = cv.transform(df['text'])            # DTM of every tweet under this vocab, for later cells

model = MultinomialNB()       # create a new model
model.fit(X_train, y_train)   # train the model with the training data only

y_pred = model.predict(X_test)     # predict labels for the held-out tweets
accuracy_score(y_test, y_pred)     # (y_true, y_pred): was the prediction any good?

1.0

In [52]:
# let's try it with some data of our own!

test_text = ['my iphone is not working',
            'my spotify playlist is the best',
            'my playlist is not coming up',
            'my macbook is having trouble',
            'spotify spotify spotify spotify spotify',
            'I love to listen to music',
            'spotify premium',
            'brilliant']

# transform() only -- no fit() -- so the new sentences are counted against the
# vocabulary the vectorizer already has, which is what the model expects
test_dtm = cv.transform(test_text)

In [53]:
# one prediction per sentence in test_text, in order (0 = Apple, 1 = Spotify)
model.predict(test_dtm)

array([0, 0, 0, 0, 1, 0, 1, 1])

In [54]:
# list the Spotify tweets that went into df
spotify_df['text']

0     @105840 Hi there! What device is this happenin...
1     @SpotifyCares Thanks! Version 8.4.22.857 armv7...
2     @105840 Thanks. The distance could possibly af...
3     @SpotifyCares No, but I've moved speaker to ab...
4     @105840 That's great to hear. If anything come...
5                      @SpotifyCares Brilliant thanks 😊
6     @105840 You're welcome! If there's anything el...
7     @76495 @91226 Please help! Spotify Premium ski...
8     @105847 Hi Harry! Help's here. Can you check i...
9     @SpotifyCares seems to be working at this stag...
10    @105847 Sure thing. We'll keep an eye out for ...
11    @SpotifyCares problem has come back again toda...
12    @105847 Thanks for giving us a nudge about thi...
13    @SpotifyCares It's on a Macbook Air (early 201...
14    @105847 Got it. Can you try the steps here for...
15    @SpotifyCares i've been having issues with pla...
Name: text, dtype: object

In [55]:
# we'll break up our data into training and testing data
# we'll train (fit) with the training data
# we'll test (predict) with the testing data

# this checks if our model can correctly classify things it hasn't seen before
#
# A single split can be lucky or unlucky, so repeat it 100 times with a
# different shuffle each time and look at the spread of the scores.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

y = df['target']              # the known labels, from df['target']

all_scores = []

for i in range(100):
    # Split the raw text first. random_state=i gives every pass its own
    # shuffle while keeping the whole experiment repeatable.
    text_train, text_test, y_train, y_test = train_test_split(
        df['text'], y, random_state=i)

    # Build the vocabulary from the training tweets only, so that words from
    # the held-out tweets cannot leak into the features.
    cv = CountVectorizer()
    X_train = cv.fit_transform(text_train)
    X_test = cv.transform(text_test)

    model = MultinomialNB()       # create a new model
    model.fit(X_train, y_train)   # train the model with the training data only

    y_pred = model.predict(X_test)     # predict labels for the held-out tweets
    all_scores.append(accuracy_score(y_test, y_pred))     # (y_true, y_pred)

# DTM of every tweet under the last pass's vocabulary, kept for later cells
X = cv.transform(df['text'])

Series(all_scores).describe()

count    100.00000
mean       0.92100
std        0.06559
min        0.80000
25%        0.90000
50%        0.90000
75%        1.00000
max        1.00000
dtype: float64

In [56]:
# Usenet ("netnews") discussion groups -- a larger text collection
# with many categories to classify

from sklearn.datasets import fetch_20newsgroups

# subset='train' is the loader's default, spelled out here
newsgroups = fetch_20newsgroups(subset='train')

In [57]:
# careful: this is the number of fields in the container (the names that
# dir() lists), not the number of posts
len(newsgroups)

5

In [58]:
# the names of the fields we can look up on newsgroups
dir(newsgroups)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [59]:
# the first 20 posts as raw strings; each one starts with its headers
# (From:, Subject:, ...) followed by the body of the message
newsgroups['data'][:20]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [61]:
# the labels for those same 20 posts: one integer per post
# (presumably an index into newsgroups['target_names'] -- verify before relying on it)
newsgroups['target'][:20]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4,  8, 19,  4, 14,  6,  0,  1,
        7, 12,  5])