In [1]:
import pandas as pd

In [3]:
hr_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/HR_comma_sep.csv.txt')

In [4]:
hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
hr_data.rename(columns={'sales':'department'}, inplace=True)

In [6]:
hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
department               14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [7]:
feature_data = hr_data.drop(columns=['left'])

In [8]:
target_data = hr_data.left

In [10]:
feature_data.salary.value_counts()

low       7316
medium    6446
high      1237
Name: salary, dtype: int64

In [12]:
feature_data.department.value_counts()

sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: department, dtype: int64

In [11]:
feature_data.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0


In [15]:
feature_data_num_cols = feature_data.select_dtypes(exclude=['object']).columns.tolist()

In [16]:
feature_data_num_cols

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'promotion_last_5years']

In [17]:
feature_data_cat_cols = feature_data.select_dtypes(include=['object']).columns.tolist()

In [18]:
feature_data_cat_cols

['department', 'salary']

In [21]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [22]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [43]:
from sklearn.impute import SimpleImputer

In [44]:
pipeline_num = make_pipeline(StandardScaler())

In [24]:
pipeline_cat = make_pipeline(OrdinalEncoder())

In [26]:
preprocessor = make_column_transformer((pipeline_num, feature_data_num_cols), 
                        (pipeline_cat, feature_data_cat_cols))

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [29]:
lr_pipeline = make_pipeline(preprocessor, LogisticRegression())

In [34]:
tree_pipeline = make_pipeline(preprocessor, DecisionTreeClassifier())

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
trainX, testX, trainY, testY = train_test_split(feature_data, target_data)

In [39]:
tree_pipeline.fit(trainX, trainY)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('standardscaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['satisfaction_level',
                                                   'last_evaluation',
                          

In [40]:
tree_pipeline.score(testX,testY)

0.9792

In [42]:
for model in [LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier()]:
    pipeline = make_pipeline(preprocessor, model)
    pipeline.fit(trainX, trainY)
    print (pipeline.score(testX,testY))



0.7784
0.9805333333333334
0.9472


### Horror Data

In [59]:
horror_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/horror-train.csv')

In [60]:
horror_data = horror_data[['text','author']]

In [61]:
horror_data.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",EAP
1,It never once occurred to me that the fumbling...,HPL
2,"In his left hand was a gold snuff box, from wh...",EAP
3,How lovely is spring As we looked from Windsor...,MWS
4,"Finding nothing else, not even gold, the Super...",HPL


In [62]:
horror_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 2 columns):
text      19579 non-null object
author    19579 non-null object
dtypes: object(2)
memory usage: 306.0+ KB


In [63]:
from sklearn.feature_extraction.text import CountVectorizer

In [64]:
from sklearn.pipeline import make_pipeline

In [65]:
from sklearn.naive_bayes import MultinomialNB

In [66]:
from nltk.tokenize import RegexpTokenizer

In [67]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [68]:
horror_data['text'] = horror_data.text.map(lambda x:tokenizer.tokenize(x))

In [69]:
horror_data.head()

Unnamed: 0,text,author
0,"[This, process, however, afforded, me, no, mea...",EAP
1,"[It, never, once, occurred, to, me, that, the,...",HPL
2,"[In, his, left, hand, was, a, gold, snuff, box...",EAP
3,"[How, lovely, is, spring, As, we, looked, from...",MWS
4,"[Finding, nothing, else, not, even, gold, the,...",HPL


In [70]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [71]:
horror_data['text'] = horror_data.text.map(lambda l: [stemmer.stem(word) for word in l])

In [72]:
horror_data.head()

Unnamed: 0,text,author
0,"[this, process, howev, afford, me, no, mean, o...",EAP
1,"[it, never, onc, occur, to, me, that, the, fum...",HPL
2,"[in, his, left, hand, was, a, gold, snuff, box...",EAP
3,"[how, love, is, spring, as, we, look, from, wi...",MWS
4,"[find, noth, els, not, even, gold, the, superi...",HPL


In [78]:
horror_data.text = horror_data.text.str.join(sep=' ')

In [79]:
cv = CountVectorizer(stop_words='english')

In [80]:
horror_data_tf = cv.fit_transform(horror_data.text)

In [81]:
horror_data_tf.shape

(19579, 14899)

In [83]:
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(horror_data_tf, horror_data.author)

In [84]:
mnb = MultinomialNB()

In [85]:
mnb.fit(trainX, trainY)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [86]:
mnb.score(testX, testY)

0.8128702757916241

In [87]:
test_horror = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/horror-test.csv')

In [89]:
test_horror.drop(columns=['id'], inplace=True)

In [90]:
cv

Unnamed: 0,text
0,"Still, as I urged our leaving Ireland with suc..."
1,"If a fire wanted fanning, it could readily be ..."
2,And when they had broken down the frail door t...
3,While I was thinking how I should possibly man...
4,I am not sure to what limit his knowledge may ...
5,"""The thick and peculiar mist, or smoke, which ..."
6,"That which is not matter, is not at all unless..."
7,I sought for repose although I did not hope fo...
8,"Upon the fourth day of the assassination, a pa..."
9,"""The tone metaphysical is also a good one."


In [91]:
cv.transform()

{'process': 10114,
 'howev': 6245,
 'afford': 218,
 'mean': 8010,
 'ascertain': 757,
 'dimens': 3536,
 'dungeon': 3968,
 'make': 7824,
 'circuit': 2289,
 'return': 10834,
 'point': 9842,
 'whenc': 14550,
 'set': 11534,
 'awar': 939,
 'fact': 4664,
 'perfect': 9523,
 'uniform': 13753,
 'wall': 14404,
 'onc': 9004,
 'occur': 8924,
 'fumbl': 5284,
 'mere': 8103,
 'mistak': 8266,
 'left': 7422,
 'hand': 5843,
 'gold': 5582,
 'snuff': 11924,
 'box': 1543,
 'caper': 1906,
 'hill': 6100,
 'cut': 3090,
 'manner': 7875,
 'fantast': 4719,
 'step': 12283,
 'took': 13160,
 'incess': 6529,
 'air': 287,
 'greatest': 5678,
 'possibl': 9932,
 'self': 11457,
 'satisfact': 11237,
 'love': 7683,
 'spring': 12163,
 'look': 7651,
 'windsor': 14651,
 'terrac': 12905,
 'sixteen': 11782,
 'fertil': 4826,
 'counti': 2878,
 'spread': 12158,
 'beneath': 1231,
 'speckl': 12087,
 'happi': 5866,
 'cottag': 2858,
 'wealthier': 14479,
 'town': 13208,
 'year': 14825,
 'heart': 5969,
 'cheer': 2171,
 'fair': 4680,
 'no