<a href="https://colab.research.google.com/github/roitraining/PythonML/blob/Development/Ch04-DataPrep/04-01-DataPreparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Central Tendency
Pandas is a module that contains the DataFrame object.
Here we look at the three measures of central tendency (mean, median, and mode) and the count of values in the DataFrame.
Then we show each unique value and how many times it occurs.

In [None]:
import pandas as pd

# Small sample of ages used to illustrate the measures of central tendency.
df = pd.DataFrame({'Age': [9, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14]})

ages = df['Age']
# mode() returns a Series because several values can tie; take its first entry.
print("Mean", ages.mean(), "Median", ages.median(), "Mode", ages.mode().iloc[0], "Count", ages.count())
# How many times each distinct age occurs.
print(ages.value_counts())



## Plotting
We can visualize what the data looks like with a number of different plots.
Boxplots are useful to see the big picture on a series of numbers.
We can see the minimum, maximum, median, and the interquartile range.

## Box Plot
Shows the minimum, maximum, median, and interquartile range.

In [None]:
%matplotlib inline
import matplotlib as mp
from matplotlib import pyplot as plt
plt.ylim(8,15)
df.boxplot()



## Histogram
Good for looking at how many items fall within a range.

In [None]:
%matplotlib inline
import numpy as np
df = pd.DataFrame(np.random.rand(253, 1) * 254, columns=['col1'])
df.hist(histtype='bar', ec='black')



## Bar Chart
Useful for seeing how many items are in each category.

In [None]:
# Five people described by gender and age.
df = pd.DataFrame({
    'Gender': ['Male', 'Male', 'Female', 'Female', 'Female'],
    'Age': [10, 11, 11, 12, 12],
})

# count() tallies the non-null values of each remaining column per gender.
x = df.groupby('Gender').count()
print(x)
x.plot.bar()

## Replacing Null Values with the Central Tendency

In [None]:
import pandas as pd

fatal = pd.read_csv('2012_Workplace_Fatalities_by_State.csv')
print(fatal.columns)

# Replace the headers read from the file with short identifier-style names.
fatal.columns = [
    'State',
    'NumberOfFatalities',
    'RateOfFatalities',
    'StateRank',
    'NumberOfInjuries',
    'InjuriesRate',
    'PenaltiesAvg',
    'PenaltiesRank',
    'Inspectors',
    'YearsToInspectEachWorkplaceOnce',
    'StateFederal',
]

# mean() skips missing values, so this is the average of the known ranks.
rank_mean = fatal['PenaltiesRank'].mean()
print(rank_mean)

# Rows from position 48 onward, shown before and after the fill.
print(fatal['PenaltiesRank'][48:])
print(fatal['PenaltiesRank'][48:].isnull())
fatal['PenaltiesRank'] = fatal['PenaltiesRank'].fillna(rank_mean)
print(fatal['PenaltiesRank'][48:])

# Discard every row that still has a missing value in any column.
fatal.dropna(axis=0, inplace=True)
print(fatal.shape)

## Add and Remove Columns to a DataFrame

In [None]:
print(fatal.columns)

# Integer code for each distinct StateFederal value, inserted at column position 11.
program_codes = pd.Categorical(fatal['StateFederal']).codes
fatal.insert(11, 'ProgramType', program_codes)

# Show the new codes next to the text they were derived from (first five rows).
print(fatal[['ProgramType', 'StateFederal']].head(5))

# Remove the original text column now that the coded column has been added.
fatal.drop(columns=['StateFederal'], inplace=True)
print(fatal.columns)


## Change Data Type

In [None]:
# Rows from position 48 onward, before the conversion.
print(fatal['NumberOfFatalities'][48:])

# Any missing value becomes 0 first, so the column can be cast to integers.
fatalities_filled = fatal['NumberOfFatalities'].fillna(0)
fatal['NumberOfFatalities'] = fatalities_filled.astype(int)

# The same rows after the conversion.
print(fatal['NumberOfFatalities'][48:])



## Rescale Data

In [None]:
from sklearn import preprocessing as pp

# Work on a float copy of the column and show its summary statistics.
x = fatal.NumberOfFatalities.astype(float)
print(x.mean(), x.std(), x.min(), x.max())
print(x[10:15])

# Show each combination of centering (with_mean) and scaling (with_std)
# exactly once; the original cell printed the last combination twice.
for with_mean, with_std in [(False, False), (True, False), (False, True), (True, True)]:
    print(pp.scale(x, with_mean=with_mean, with_std=with_std)[10:15])

# Store the fully standardized values (centered and scaled) back in the DataFrame.
r = pp.scale(x, with_mean=True, with_std=True)
fatal.NumberOfFatalities = r
print('rescaled', fatal.NumberOfFatalities[10:15])


## Concat Data

In [None]:
# Two frames with the same columns.
df1 = pd.DataFrame({
    'Gender': ['Male', 'Male', 'Female', 'Female', 'Female'],
    'Age': [10, 11, 11, 12, 12],
})
df2 = pd.DataFrame({
    'Gender': ['Male', 'Male', 'Female', 'Female'],
    'Age': [20, 21, 21, 22],
})

# Stack the rows of df2 underneath df1 (each frame keeps its own index labels).
df = pd.concat([df1, df2], axis=0)
print(df)

# A frame with different columns and one row fewer than df1.
df3 = pd.DataFrame({
    'First': ['John', 'Joe', 'Jane', 'Jill'],
    'Last': ['Smith', 'Average', 'Doe', 'Hill'],
})

# Place the columns side by side, aligned on the index; the index label
# that df3 lacks is filled with NaN.
df = pd.concat([df1, df3], axis=1)
print(df)


## Merge or Join DataFrames

In [None]:
# People, identified by a string id.
person_data = {
    'id': ['1', '2', '3', '4', '5'],
    'first_name': ['John', 'Sue', 'Jack', 'Alice', 'Joe'],
    'last_name': ['Smith', 'Miller', 'Sprat', 'Wonderland', 'Blow'],
}
df1 = pd.DataFrame(person_data, columns=['id', 'first_name', 'last_name'])

# Skills per person id; id '4' has no skill and id '6' has no matching person.
skill_data = {
    'id': ['1', '1', '2', '3', '3', '3', '5', '6'],
    'skill': ['C++', 'Java', 'Java', 'C++', 'Java', 'Python', 'Python', 'Java'],
}
df2 = pd.DataFrame(skill_data, columns=['id', 'skill'])

# Default (inner) join on 'id': only ids present in both frames are kept.
print(df1.merge(df2, on='id'))

# Left join: every person is kept. With no `on` given, the column name the
# two frames share ('id') is used as the key.
print(df1.merge(df2, how='left'))



## Convert Categorical Data

In [None]:
person_data = {
    'id': ['1', '2', '3', '4', '5'],
    'first_name': ['John', 'Sue', 'Jack', 'Alice', 'Joe'],
    'status': ['Active', 'Active', 'Pending', 'Cancelled', 'Cancelled'],
}
df1 = pd.DataFrame(person_data, columns=['id', 'first_name', 'status'])
print(df1)

# Replace each status label with its integer category code. The categories
# are inferred from the sorted labels, so Active=0, Cancelled=1, Pending=2.
df1['status'] = df1['status'].astype('category').cat.codes
print(df1)

## Dummy Encoded

In [None]:
person_data = {
    'id': ['1', '2', '3', '4', '5'],
    'first_name': ['John', 'Sue', 'Jack', 'Alice', 'Joe'],
    'status': ['Active', 'Active', 'Pending', 'Cancelled', 'Cancelled'],
}
df1 = pd.DataFrame(person_data, columns=['id', 'first_name', 'status'])
print(df1)

# Columns carried over unchanged into both encoded frames.
identity_cols = df1[['id', 'first_name']]

# drop_first=True omits the indicator column of the first category.
dummies = pd.get_dummies(df1['status'], drop_first=True)
df2 = pd.concat([identity_cols, dummies], axis=1)
print(df2)

# drop_first=False keeps one indicator column per category.
dummies = pd.get_dummies(df1['status'], drop_first=False)
df3 = pd.concat([identity_cols, dummies], axis=1)
print(df3)



## Split Data into Train and Test Sets

In [None]:
from sklearn.model_selection import train_test_split


def _report_split(label, full, train_part, test_part):
    """Print the ProgramType proportions of a split next to the full data.

    Parameters
    ----------
    label : str
        Heading printed before the figures.
    full, train_part, test_part : pandas.DataFrame
        The complete frame and the two parts it was split into; each must
        have a ``ProgramType`` column.
    """
    print(label)
    for frame in (full, train_part, test_part):
        codes = frame.ProgramType
        # Share of rows per ProgramType value within this frame.
        print(codes.value_counts() / codes.count())
    print(full.shape, train_part.shape, test_part.shape)


# Split 1: sample 80% of the rows with pandas; the test set is every row
# whose index label was not sampled.
train = fatal.sample(frac=0.8, random_state=200)
test = fatal[~fatal.index.isin(train.index)]
_report_split('Split 1', fatal, train, test)

# Split 2: the same 80/20 split with scikit-learn. A fixed random_state
# makes this split reproducible, matching Split 1.
train, test = train_test_split(fatal, test_size=0.2, random_state=200)
_report_split('Split 2', fatal, train, test)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 

# Folder (relative path) that is passed to corpus_from_dir() below.
DIR = 'text'

def corpus_from_dir(folder):
    """Load every file in ``folder`` as one document of a corpus.

    Parameters
    ----------
    folder : str
        Directory to read from.

    Returns
    -------
    dict
        ``docs`` is a list holding the text of each file. ``ColNames`` is a
        list holding each file name up to its first ``.``, in the same
        order as ``docs``.
    """
    import os

    # List the directory once so documents and names are guaranteed to line
    # up, and skip sub-directories (e.g. .ipynb_checkpoints), which cannot
    # be opened as files.
    filenames = [f for f in os.listdir(folder)
                 if os.path.isfile(os.path.join(folder, f))]

    docs = []
    for f in filenames:
        # The with-block closes each file as soon as it has been read.
        with open(os.path.join(folder, f)) as handle:
            docs.append(handle.read())

    # Build a real list rather than a lazy map object, so the names can be
    # printed and iterated more than once.
    col_names = [f.split('.')[0] for f in filenames]
    return dict(docs=docs, ColNames=col_names)

def tdm_df(docs, colNames = None, **kwargs):
    """Build a term-document matrix as a DataFrame.

    Parameters
    ----------
    docs : iterable
        The documents handed to ``CountVectorizer.fit_transform``.
    colNames : iterable, optional
        Column labels, one per document. When omitted the default integer
        column labels are kept.
    **kwargs
        Forwarded unchanged to ``CountVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One row per term (the index) and one column per document, holding
        the counts produced by the vectorizer.
    """
    # initialize the vectorizer
    vectorizer = CountVectorizer(**kwargs)
    counts = vectorizer.fit_transform(docs)

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back only on old releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # create dataFrame: transpose so that terms are rows and documents are columns
    df = pd.DataFrame(counts.toarray().transpose(), index = terms)
    if colNames is not None:
        # Materialise first so lazy iterables (e.g. map objects) are accepted.
        df.columns = list(colNames)

    return df

# Read the documents found in DIR and show what was loaded.
corpus = corpus_from_dir(DIR)
print(corpus)

# Tabulate term counts per document; stop_words is forwarded to the vectorizer.
df = tdm_df(corpus['docs'], corpus['ColNames'], stop_words='english')
print(df)


#End of notebook