# Text Classification Project

In [2]:
import pandas as pd
df = pd.read_csv('business.csv')
df.head()

Unnamed: 0,description,industry
0,"Markets where business sold raw, unprocessed m...",total
1,Which of the following overseas markets did th...,Education & training
2,Which of the following overseas markets did th...,Health care & social assistance
3,Which of the following overseas markets did th...,Arts & recreation services
4,Which of the following overseas markets did th...,Other services


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x = cv.fit_transform(df.description)
x

<1333x103 sparse matrix of type '<class 'numpy.int64'>'
	with 21479 stored elements in Compressed Sparse Row format>

In [4]:
cv.vocabulary_

{'markets': 57,
 'where': 100,
 'business': 11,
 'sold': 84,
 'raw': 77,
 'unprocessed': 96,
 'materials': 58,
 'nz': 63,
 'only': 66,
 'which': 101,
 'of': 64,
 'the': 91,
 'following': 32,
 'overseas': 70,
 'did': 22,
 'this': 92,
 'sell': 81,
 'goods': 34,
 'and': 5,
 'services': 83,
 'australia': 9,
 'other': 68,
 'pacific': 72,
 'china': 13,
 'japan': 47,
 'india': 43,
 'asean': 7,
 'member': 59,
 'states': 87,
 'asia': 8,
 'usa': 97,
 'americas': 4,
 'united': 95,
 'kingdom': 49,
 'eu': 28,
 'europe': 29,
 'middle': 60,
 'east': 26,
 'africa': 2,
 'key': 48,
 'factors': 31,
 'that': 90,
 'helped': 37,
 'compete': 14,
 'in': 41,
 'experienced': 30,
 'managerial': 55,
 'staff': 86,
 'non': 61,
 'valuable': 99,
 'brand': 10,
 'high': 38,
 'quality': 76,
 'or': 67,
 'unique': 94,
 'intellectual': 45,
 'property': 75,
 'ability': 0,
 'to': 93,
 'customise': 19,
 'specific': 85,
 'customer': 17,
 'requirements': 78,
 'lower': 53,
 'production': 74,
 'costs': 16,
 'due': 24,
 'technolog

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
sw= list(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,df.description,test_size=0.2)

In [7]:
def train(x):
    x_train,x_test,y_train,y_test = train_test_split(x,df.description,test_size=0.2)    
    clf = LogisticRegression(max_iter=10000)
    clf.fit(x_train,y_train)
    return clf.score(x_test,y_test)

In [8]:
def remove_sw(text):
    words = text.split()
    filtered_words = []
    for word in words:
        if word not in sw:
            filtered_words.append(word)
    return ' '.join(filtered_words)

In [9]:
remove_sw("Which of the following overseas markets did this business sell goods and services? - Australia")

'Which following overseas markets business sell goods services? - Australia'

In [10]:
df['clean_text'] = df.description.apply(remove_sw)
df['clean_text2'] = df.industry.apply(remove_sw)
df.head()

Unnamed: 0,description,industry,clean_text,clean_text2
0,"Markets where business sold raw, unprocessed m...",total,"Markets business sold raw, unprocessed materia...",total
1,Which of the following overseas markets did th...,Education & training,Which following overseas markets business sell...,Education & training
2,Which of the following overseas markets did th...,Health care & social assistance,Which following overseas markets business sell...,Health care & social assistance
3,Which of the following overseas markets did th...,Arts & recreation services,Which following overseas markets business sell...,Arts & recreation services
4,Which of the following overseas markets did th...,Other services,Which following overseas markets business sell...,Other services


In [11]:
x = cv.fit_transform(df.clean_text).toarray()
train(x)

1.0

In [12]:
import re

In [13]:
def onlyLetters(text):
    return re.sub('[^a-zA-Z ]*','',text) 

In [14]:
df['clean_text'] = df.clean_text.apply(onlyLetters)
df['clean_text2'] = df.clean_text2.apply(onlyLetters)
df.head()

Unnamed: 0,description,industry,clean_text,clean_text2
0,"Markets where business sold raw, unprocessed m...",total,Markets business sold raw unprocessed material...,total
1,Which of the following overseas markets did th...,Education & training,Which following overseas markets business sell...,Education training
2,Which of the following overseas markets did th...,Health care & social assistance,Which following overseas markets business sell...,Health care social assistance
3,Which of the following overseas markets did th...,Arts & recreation services,Which following overseas markets business sell...,Arts recreation services
4,Which of the following overseas markets did th...,Other services,Which following overseas markets business sell...,Other services


In [15]:
x = cv.fit_transform(df.clean_text).toarray()
train(x)

1.0

In [16]:
x.shape

(1333, 87)

# Stemming

In [25]:
from nltk.stem.snowball import SnowballStemmer

In [26]:
steming = SnowballStemmer("english")

In [27]:
def stem(message):
    tokens = message.split()
    filtered_words = []
    for token in tokens:
        filtered_words.append(steming.stem(token))
        
    return ' '.join(filtered_words)

In [28]:
steming.stem("detection")

'detect'

In [29]:
df['clean_text'] = df.clean_text.apply(stem)
df['clean_text2'] = df.clean_text2.apply(stem)
df.head()

Unnamed: 0,description,industry,clean_text,clean_text2
0,"Markets where business sold raw, unprocessed m...",total,market busi sold raw unprocess materi nz,total
1,Which of the following overseas markets did th...,Education & training,which follow oversea market busi sell good ser...,educ train
2,Which of the following overseas markets did th...,Health care & social assistance,which follow oversea market busi sell good ser...,health care social assist
3,Which of the following overseas markets did th...,Arts & recreation services,which follow oversea market busi sell good ser...,art recreat servic
4,Which of the following overseas markets did th...,Other services,which follow oversea market busi sell good ser...,other servic


In [30]:
x = cv.fit_transform(df.clean_text)
train(x)

1.0

In [34]:
x.shape

(1333, 82)