In [1]:
# text classification

# let's make predictions!


# End goal:

# model.predict(my_text)  # it'll tell us the classification of that text

In [2]:
# two basic types of machine learning:
# - supervised learning
#     teaching the computer with examples
#     asking it to make predictions based on examples

#     - classification
#     - regression

# - unsupervised learning

In [3]:
# within machine learning
# within supervised learning
# within classification

# text classification

In [4]:
# classify by source
# classify by meaning
# classify by age

# spam filter

In [6]:
# Fetch the sample data archive.
# -nc (no-clobber): skip the download when text-analysis.zip already exists,
# so re-running this cell does not pile up numbered copies (text-analysis.zip.1, ...).
!wget -nc http://files.lerner.co.il/text-analysis.zip

--2020-10-14 22:17:18--  http://files.lerner.co.il/text-analysis.zip
Resolving files.lerner.co.il (files.lerner.co.il)... 138.197.26.202
Connecting to files.lerner.co.il (files.lerner.co.il)|138.197.26.202|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3447 (3.4K) [application/zip]
Saving to: ‘text-analysis.zip’


2020-10-14 22:17:19 (8.25 KB/s) - ‘text-analysis.zip’ saved [3447/3447]



In [7]:
# Extract the two CSV files from the archive.
# -o: overwrite existing files without asking, so re-running this cell does not
# stop at unzip's interactive "replace ...?" prompt once the CSVs already exist.
!unzip -o text-analysis.zip

Archive:  text-analysis.zip
  inflating: spotify-support.csv     
  inflating: apple-support.csv       


In [8]:
# Peek at the raw CSV: header row plus the first few records
# (10 lines, which is head's default, spelled out explicitly here).
!head -n 10 apple-support.csv

tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡,119236,
119248,AppleSupport,False,Wed Oct 11 13:38:29 +0000 2017,@105837 We can help. Which version of iOS are you on? You can find that in Settings &gt; General &gt; About. Reply in DM. https://t.co/GDrqU22YpT,,119249
119249,105837,True,Wed Oct 11 07:37:27 +0000 2017,"@105838 @AppleSupport Me too am suffering , hope the can find a solution",119248,119250
119250,105838,True,Wed Oct 11 05:33:17 +0000 2017,"@AppleSupport hi #apple, I’ve a concern about the latest ios is too slow on #iphone6 and i am not happy with it. Any solution please?","119249,119251",
119252,AppleSupport,False,Wed Oct 11 13:40:27 +0000 2017,@105839 Thanks for reaching out to us. We are always happy to help. Send us a DM so we can look into this together. https

In [5]:
# (1) get some input text, for which we know the classification
# (2) clean up the text / put it into a format the model could use
# (3) part of that is: defining a vocabulary and creating a term matrix
# (4) create the model
# (5) split our data into training and testing data
# (6) make some predictions
# (7) check -- how good are our predictions?

# part 1: Get the text into data frames

In [9]:
%pylab inline
import pandas as pd
from pandas import Series, DataFrame

Populating the interactive namespace from numpy and matplotlib


In [11]:
# Load only the tweet text from each support account's CSV;
# the remaining columns (ids, timestamps, ...) are not read.
apple_df, spotify_df = (
    pd.read_csv(csv_name, usecols=['text'])
    for csv_name in ('apple-support.csv', 'spotify-support.csv')
)

In [13]:
# Dimensions of the Apple tweets frame as (rows, columns)
apple_df.shape

(24, 1)

In [14]:
# Dimensions of the Spotify tweets frame as (rows, columns)
spotify_df.shape

(16, 1)

In [15]:
# add a column indicating which source the tweets are from
# Label every tweet with its source: 0 = Apple support, 1 = Spotify support.
# This 'target' column is the class the model will learn to predict.
apple_df['target'], spotify_df['target'] = 0, 1

In [16]:
# Stack both sources into a single frame.
# ignore_index=True renumbers the rows 0..n-1; without it each source keeps its
# own 0-based labels, so labels repeat and label-based lookups such as
# df.loc[0] would match one row from each source.
df = pd.concat([apple_df, spotify_df], ignore_index=True)
df.head()

Unnamed: 0,text,target
0,@AppleSupport causing the reply to be disregar...,0
1,@105837 We can help. Which version of iOS are ...,0
2,"@105838 @AppleSupport Me too am suffering , ho...",0
3,"@AppleSupport hi #apple, I’ve a concern about ...",0
4,@105839 Thanks for reaching out to us. We are ...,0


In [17]:
# Dimensions of the combined frame as (rows, columns)
df.shape

(40, 2)

In [18]:
# The full data-preparation recipe in one cell: load, label, combine.

# Read only the tweet text from each CSV.
apple_df = pd.read_csv('apple-support.csv', usecols=['text'])
spotify_df = pd.read_csv('spotify-support.csv', usecols=['text'])

# Class labels: 0 = Apple support, 1 = Spotify support.
apple_df['target'] = 0
spotify_df['target'] = 1

# ignore_index=True gives the combined frame a fresh 0..n-1 index; otherwise
# both sources keep their own 0-based labels and the index contains duplicates.
df = pd.concat([apple_df, spotify_df], ignore_index=True)

In [19]:
# Show the combined DataFrame as this cell's output
df

Unnamed: 0,text,target
0,@AppleSupport causing the reply to be disregar...,0
1,@105837 We can help. Which version of iOS are ...,0
2,"@105838 @AppleSupport Me too am suffering , ho...",0
3,"@AppleSupport hi #apple, I’ve a concern about ...",0
4,@105839 Thanks for reaching out to us. We are ...,0
5,@105841 We'd love to help! Please DM us and le...,0
6,@AppleSupport after the 11.0.2 my phone just s...,0
7,"@105843 Battery life is important, and we're h...",0
8,@105844 Thanks for reaching out to us. We are ...,0
9,@AppleSupport Can you get my iPhone 7plus back...,0


In [23]:
# Full text of the first tweet (the table view above elides long strings with "...")
df['text'].iloc[0]

'@AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡'

In [24]:
# text #1 = have a nice day
# text #2 = what day is today?

# vocabulary

# document term matrix (DTM)

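# one row per text, one column per vocabulary word, each cell = word count
# (the short words "a" and "is" are left out to keep this table small)
# have nice day what today
# 1     1   1   0     0
# 0     0   1   1     1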

# We'll use scikit-learn's CountVectorizer to build both the vocabulary
# and the document-term matrix (DTM) from our tweets.

from sklearn.feature_extraction.text import CountVectorizer
# No arguments are passed, so the vectorizer runs with its default settings.
cv = CountVectorizer()

