<img src="./python.png">

# Contents

<div class="alert alert-block alert-info">

<h2> 1. Python Basics</h2>
</div>

* *Python data types and functions*
* *List Comprehensions*

<div class="alert alert-block alert-info">
<h2> 2. Feature Engineering with Pandas</h2>
</div>

* *Dataframes - Create, Read & Write*
* *Transformations - Drop, Filter & transform variables*
* *Get Data Stats*
* *Missing Imputation*
* *Capping and Flooring*
* *Handling Categorical Features*
* *Data Scaling*

<div class="alert alert-block alert-info">
<h2> 3. Advanced Features</h2>
</div>

* *Sorting, Grouping and Merging*
* *Iterating over rows*
* *Advanced functions*
* *Multi processing*
* *Plot functions*
* *Features Selection*



### Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import numpy.ma as ma
import time
from functools import reduce

from sklearn import preprocessing
import sklearn
import datetime
from dateutil.relativedelta import relativedelta
from multiprocessing import Pool
from functools import partial
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_boston,load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

from sklearn.cluster import KMeans
from numpy.random import rand
# import networkx as nx
import copy
import warnings
warnings.filterwarnings("ignore") 

<div class="alert alert-block alert-info">

<h2> 1. Python Basics</h2>
</div>

* *Python data types and functions*
* *List Comprehensions*



#### Python data types
 - Variables: int, float, string, boolean
 - Collections: List, Dict, Set, Tuple

In [None]:
# Numbers: in Python 3 the `/` operator always produces a float, so the
# second entry below is 0.5 even though the list is called `ints`.
ints = [1, 1 / 2, int(1.0)]
print(ints)
print(list(map(type, ints)), "\n")

floats = [1.0, 1.0 / 2, float(1)]
print(floats)
print(list(map(type, floats)), "\n")

# str() works on any object, including a whole list.
strings = ["abcdef", str(1), str(floats)]
print(strings)
print(list(map(type, strings)), "\n")

# Lists are mutable: append, remove (first matching value) and sort all
# modify the list in place.
new_list = [5, 1, 2, 3, 3]
new_list.append(4)
new_list.remove(1)
new_list.sort()
print(new_list, "\n")

# Slicing: first three, everything after the third, all but the last three,
# and the last three elements.
head, tail = new_list[:3], new_list[3:]
all_but_last_three, last_three = new_list[:-3], new_list[-3:]
print(head, tail, all_but_last_three, last_three)

# Sets keep unique values only; `|` is the union operator.
new_set = {6, 7}
new_set.add(8)
new_set = new_set | set(new_list)
print(new_set, "\n")

# A dict can hold values of any type, including other collections.
new_dict = {
    "int": 1,
    "float": 1.0,
    "String": "abc",
    "boolean": True,
    "List": [1, 2, 3],
    "Set": {1, 2, 3},
    "Dict": {"a": 1, "b": 2},
}
print(new_dict)

In [None]:
import copy

# Plain assignment does NOT copy anything: `e` becomes a second name for the
# very same array object as `d`, so a write through `e` is visible through
# `d` as well. (np.array is used because np.mat no longer exists in NumPy 2.)
d = np.array([[2, 3], [1, 0]])
print("d:", d)
e = d
e[1, 1] = 5
print("assignment (alias, no copy):")
print("e:", e, "\n", "d:", d)

# deep copy: `b` is an independent object, so removing an element from it
# leaves `a` untouched.
a = [1, 2, 3]
print("a:", a)
b = copy.deepcopy(a)
b.remove(1)
print("deep copy:")
print("a:", a, "b:", b)

#### Functions and Lambda Function

In [None]:
def add(a, b):
    """Return the result of ``a + b``."""
    return a + b


# The same operation as an anonymous function. Binding it to `add` replaces
# the `def` version above, so the call below goes through the lambda.
add = lambda a, b: a + b
print(add(5, 6))

#### List Comprehension
 - Shorthand for building a new list by transforming each element of an existing one with an inline for loop

In [None]:
# Convert every element to its string form.
a = [1, 2, 3]
print(list(map(str, a)))

scores = {"off1": 0.1, "off2": 0.5, "off3": 0.2, "off4": 0.0, "off5": 0.7}
# Order the offer ids by their score, best first, then keep the top three.
ranked_offers = sorted(scores, key=scores.get, reverse=True)
top3_offers = ranked_offers[:3]
print(top3_offers)

In [None]:
# Build a list of strings from a list of ints.
a = [1, 2, 3]
print(list(map(str, a)))

In [None]:
words = ["abc", "def"]
# Nested comprehension: the outer loop walks the words, the inner loop walks
# the characters of each word, giving one flat list of characters.
[letter for word in words for letter in word]

#### Map and Reduce

In [None]:
from functools import reduce

a = [1, 2, 3, 4, 5, 6]

# map(func, seq) applies func to every element and returns a lazy iterator,
# so it has to be materialised before printing.
b = map(lambda value: value ** 2, a)
print([*b])

# reduce(func, seq) folds the sequence into a single value, left to right.
b = reduce(lambda total, value: total + value, a)
print(b)

<div class="alert alert-block alert-info">
<h2> 2. Feature Engineering with Pandas</h2>
</div>

* *Dataframes - Create, Read & Write*
* *Transformations - Drop, Filter & transform variables*
* *Get Data Stats*
* *Missing Imputation*
* *Capping and Flooring*
* *Handling Categorical Features*
* *Data Scaling*

#### Creating  DataFrame from Collections
 - The initial set of baby names and birth counts

In [None]:
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
births = [968, 155, 77, 578, 973]

# Option 1: build the frame from a list of (name, births) row tuples.
BabyDataSet = [*zip(names, births)]
print(BabyDataSet, "\n")
df1 = pd.DataFrame(BabyDataSet, columns=['Names', 'Births'])
print(df1, "\n")

# Option 2: build the frame from a dict of columns, fixing the column order
# explicitly.
df2 = pd.DataFrame({"Names": names, "Births": births}, columns=['Names', 'Births'])
print(df2)

#### Write DataFrame to csv

In [None]:
# Write only the data rows: no header line and no index column.
df1.to_csv('births1880.csv', header=False, index=False)

In [None]:
# Directory prefix that the later cells prepend to the input file names.
Location = "./"

#### Reading Data into DataFrames
- header    :None if header not present in data
- sep       :specify the delimiter
- na_values : values to be considered as null
- dtype     : data type per column, e.g. {'a': np.float64, 'b': np.int32}
- nrows     : number of rows to be read
- error_bad_lines: lines with too many fields (e.g. a csv line with too many commas) raise an exception by default, and no DataFrame is returned. If False, these "bad lines" are dropped from the DataFrame that is returned. (Deprecated in pandas 1.3 and removed in pandas 2.0; use `on_bad_lines='skip'` instead.)



In [None]:
import pandas as pd

# Read the Titanic training file found under `Location`.
# Other read_csv options worth knowing: header=None, sep=',', dtype=None, nrows=<int>
titanic_csv = Location + 'TitanicTrain.csv'
df = pd.read_csv(titanic_csv)
print(df.shape)
df.head()

### Subsetting data

 - loc : works on labels in the index.
 - iloc: works on the positions in the index (so it only takes integers).
 - ix  : usually tries to behave like loc but falls back to behaving like iloc if the label is not in the index. It was deprecated in pandas 0.20 and removed in pandas 1.0, so use loc or iloc instead.
 



In [None]:
# Subsetting data
# `DataFrame.ix` no longer exists (removed in pandas 1.0), so the two former
# `.ix` examples are written with their explicit `.loc` equivalents.
# NOTE(review): the `.loc[:5, ...]` calls assume `df` has the default integer
# index, in which case label slicing returns rows 0..5 inclusive.

# iloc: purely positional -> first 5 rows, first 2 columns
print(df.iloc[:5, :2])
# loc: label based -> index labels up to and including 5, columns by name
print(df.loc[:5, ['PassengerId', 'Name']])
# formerly df.ix[:5, :2]: row labels plus the first two columns by position
print(df.loc[:5, df.columns[:2]])
# formerly df.ix[:5, ['PassengerId', 'Name']]
print(df.loc[:5, ['PassengerId', 'Name']])

#### Transformations
  - drop variables
  - filter rows
  - transform a variable

In [None]:
# Transformations

# drop: called without inplace, so the result is a new frame that this cell
# simply discards
df.drop(columns=['Name', 'Pclass'])

# filter: combine boolean masks with '&' (and) or '|' (or)
df.loc[(df['Age'] > 20) & (df['Survived'] == 1)]

# transform: flag the passengers who are both survivors and female
df['Survived_Females'] = [
    1 if (survived == 1) and (sex == 'female') else 0
    for survived, sex in zip(df['Survived'], df['Sex'])
]
print(df['Survived_Females'].value_counts())

# element-wise log of a plain Python list
x = [1, 2, 3, 4]
log_transform = map(np.log, x)
print("log transformation:", list(log_transform))

# row-wise apply: pack two columns of each row into a single list-valued column
df['ls'] = df.apply(lambda row: [row['Name'], row['Sex']], axis=1)

#### Reading data in chunks
- You can do operations on chunks without loading the complete data 

In [None]:
titanic_csv = Location + 'TitanicTrain.csv'

# Method 1: with `chunksize`, read_csv returns an iterator of DataFrames.
# Only the first chunk is shown here.
reader = pd.read_csv(titanic_csv, usecols=['PassengerId', 'Survived'], chunksize=2)
first_chunk = next(iter(reader), None)
if first_chunk is not None:
    print(first_chunk)

# Method 2: with `iterator=True`, rows are pulled on demand via get_chunk(n).
reader = pd.read_csv(titanic_csv, iterator=True)
print(reader.get_chunk(5))

#### Get Data Stats
 - Column wise missing Counts
 - Column data Types
 - min, max, mean, median, std for numeric columns
 - value counts for categorical columns

In [None]:
print("Column-wise missing counts: \n")
print(df.isna().sum())

# Split the columns in one pass: object-dtype columns are treated as
# categorical, everything else as numeric.
categorical_features, numerical_features = [], []
for var in df.columns:
    bucket = categorical_features if df[var].dtype == 'O' else numerical_features
    bucket.append(var)

print("\nCategoric Features:", categorical_features, "\nNumeric Features:", numerical_features)
print("\nNumerical Feature stats:")
df[numerical_features].describe()

In [None]:
# Frequency of each distinct value in the Pclass column.
df['Pclass'].value_counts()

#### Missing Imputation

In [None]:
# Fill missing ages with the median age. The result is assigned back to the
# column instead of calling fillna(inplace=True) on `df.Age`: that form acts
# on an intermediate Series and, with pandas Copy-on-Write, leaves `df`
# unchanged.
df['Age'] = df['Age'].fillna(df['Age'].median())

# replace() on the frame itself swaps every cell equal to the first value.
df.replace('Braund, Mr. Owen Harris', 'test', inplace=True)
df.head()

#### Capping and Flooring

In [None]:
# Step 1: clip to fixed absolute bounds.
df['Fare'] = df['Fare'].clip(lower=5, upper=500)

# Step 2: clip to the 5th/95th percentiles of the already-clipped column.
fare_floor, fare_cap = np.percentile(df['Fare'], [5, 95])
df['Fare'] = df['Fare'].clip(fare_floor, fare_cap)
df['Fare'].describe()

#### Handling Categorical Features
 - One Hot Encoding
 - Label Encoding
 - Encoding with feature stats

In [None]:
# One Hot Encoding: one indicator column per distinct value of Sex,
# appended to the right of the existing columns.
df = pd.concat((df, pd.get_dummies(df.Sex, prefix="Sex")), axis=1)

# Label Encoding: map each distinct value of Sex to an integer code.
labelEncoder = preprocessing.LabelEncoder()
df['Sex_label_encoding'] = labelEncoder.fit_transform(df.Sex)

# Encoding with feature stats: replace the category by the mean Fare of its
# Pclass group. Selecting the 'Fare' column before transform yields a Series
# aligned to df's index, and the string 'mean' avoids the deprecated practice
# of passing the np.mean callable to a groupby method.
df['pclass_avg_fare'] = df.groupby('Pclass')['Fare'].transform('mean')
df.head()

#### Data Scaling
 - MinMax Scaling
 - Standard Scaling

In [None]:
min_max_scaler = preprocessing.MinMaxScaler()
standard_scaler = preprocessing.StandardScaler()

# scikit-learn transformers require a 2-D (n_samples, n_features) input, so
# the column is passed as a one-column frame (`df[['Fare']]`) rather than a
# 1-D Series. The (n, 1) result is flattened back to 1-D before it is stored
# as a new column.
fare_2d = df[['Fare']]
df['Fare_min_max_scaled'] = min_max_scaler.fit_transform(fare_2d).ravel()
df['Fare_standard_scaled'] = standard_scaler.fit_transform(fare_2d).ravel()
df.head()

<div class="alert alert-block alert-info">
<h2> 3. Advanced Features</h2>
</div>

* *Sorting, Grouping and Merging*
* *Iterating over rows*
* *Advanced functions*
* *Multi processing*
* *Plot functions*
* *Features Selection*

#### Sorting, Grouping and Merging

In [None]:
# Sorting
# sort by one column and renumber the index from 0
Sorted = df.sort_values(['Name'], ascending=True).reset_index(drop=True)
# sort by several columns, keeping the original index labels
Sorted = df.sort_values(['Name', 'Sex'], ascending=True)

# Grouping
df.groupby(['Pclass'])['Fare'].mean()  # Series: Pclass as index, mean Fare as values
df.groupby(['Pclass'])['Fare'].mean().reset_index()  # DataFrame with Pclass and Fare columns

df.groupby(['Pclass'])['Fare'].apply(list).reset_index()  # one list of fares per Pclass
df.groupby('Pclass')['Fare'].agg(['min', 'max'])

# rank of each passenger's age within their own Sex group
df['rank'] = df.groupby(['Sex'])['Age'].rank(ascending=True)
df[['Name', 'Sex', 'Age', 'rank']].head()

# Merging
# `.ix` was removed in pandas 1.0; `.loc` gives the same label-based,
# end-inclusive row slices.
# NOTE(review): assumes df has the default integer index, so label 5 ends up
# in both df_a and df_b exactly as it did with the old `.ix` slices.
df_a = df.loc[:5, :]
df_b = df.loc[5:10, :]

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
pd.concat([df_a, df_b], ignore_index=True)
pd.concat([df_a, df_b], axis=0)  # axis=1 concatenates along columns

# df_a.join(df_b)--> joins on index
# df_a.merge(df_b,on=[<key>],how='inner/left/right/outer')


#### Iterating over rows
 - iterrows   : iterating over rows as series
 - itertuples : iterating over rows as tuples

In [None]:
# iterrows: every step yields an (index label, row-as-Series) pair
print("iterrows:\n")
for counter, (index, row) in enumerate(df.iterrows(), start=1):
    print('Index:', index)
    print('PassengerId:', row['PassengerId'], ",", 'Name:', row['Name'])
    # stop once four rows have been shown
    if counter > 3:
        break

# itertuples: every step yields a namedtuple whose first field is the index
print('\nitertuples:\n')
for counter, row in enumerate(df.itertuples(), start=1):
    print('Index:', row.Index)
    print('PassengerId:', row.PassengerId, ",", 'Name:', row.Name, ",", "Others:", row[5:8])
    # stop once four rows have been shown
    if counter > 3:
        break

#### Advanced functions

- pivot,melt,shift
- masking arrays
- random sample-itertools,dataframe.sample
- Moving average
- Datetime functions

In [None]:
# pivot: long -> wide. Each distinct 'var' becomes a column, each distinct
# 'date' becomes a row, and the cells are filled from 'value'.
print("pivot:\n")
df3 = pd.DataFrame({
    'date': ['2000-01-03', '2000-01-04', '2000-01-04', '2000-01-03'],
    'var': ['A', 'A', 'B', 'B'],
    'value': [0.469112, -0.282863, 1.212112, 1.071804],
})
print(df3)
wide = df3.pivot(index='date', columns='var', values='value')
print(wide)

# melt: wide -> long. 'A' is kept as the identifier column, while columns
# 'B' and 'C' are stacked into variable/value pairs.
print("\n melt:\n")
df4 = pd.DataFrame({
    'A': {0: 'a', 1: 'b', 2: 'c'},
    'B': {0: 1, 1: 3, 2: 5},
    'C': {0: 2, 1: 4, 2: 6},
})
print(df4)
long = df4.melt(id_vars=['A'], value_vars=['B', 'C'])
print(long)

In [None]:
# shift: difference between each row's y and the previous row's y,
# restarting at 0 whenever the group column x changes.
a = pd.DataFrame({"x": [1, 1, 1, 2, 2, 2], "y": [10, 25, 30, 70, 50, 61]})
print(a)

# shift(1) moves the column down by one row, so this is y[i] - y[i-1]
# (NaN for the first row, which has no predecessor).
a['z'] = a['y'] - a['y'].shift(1)

# True on the first row of every run of equal x values.
mask = a['x'] != a['x'].shift(1)

# Single .loc assignment instead of the chained a['z'][mask] = 0, which
# writes through an intermediate object and is not guaranteed to reach `a`.
a.loc[mask, 'z'] = 0
print(a)

In [None]:
# Masking -- a masked array carries a boolean mask next to its data; entries
# whose mask is True are treated as missing/invalid.

x = np.array([1, 2, 0, -1, 1])
# mark the fourth entry as invalid
mx = ma.masked_array(x, mask=[False, False, False, True, False])
print("mx:", mx)
# compressed(): only the valid entries, as a plain 1-D array
print("mx_compressed:", mx.compressed())
# reductions skip the masked entries
print("mx_mean:", mx.mean())

# masked_values: mask every entry that is (approximately) equal to 1.e20
z = ma.masked_values([1.0, 1.e20, 3.0, 4.0], 1.e20)
print("z:", z)

# masked_where: additionally mask the entries where the condition holds
ma.masked_where(z > 2, z)


In [None]:
#Random Selection

# itertools
from itertools import permutations
from itertools import combinations
from itertools import combinations_with_replacement

# permutations: every ordering of the input elements
print("permutations:", list(permutations(['1', '2', '3'])))
# combinations: unordered pairs in which an element cannot pair with itself
print("combinations:", list(combinations('12345', 2)))
# combinations_with_replacement: unordered pairs that may repeat an element
print("combinations_with_replacement:", list(combinations_with_replacement('12345', 2)))

# random sample: draw 70% of the rows at random, then preview the first few
sampled = df.sample(frac=0.7)
sampled.head()

In [None]:
# rolling mean, sum
df5 = pd.DataFrame({'A': ["a", "a", "c", "c", "d", "e"], 'B': [5, 8, 2, 3, 1, 4]})
print(df5)

print("rolling mean:")
# Roll over the numeric column only: column 'A' holds strings, and recent
# pandas versions raise instead of silently skipping non-numeric columns.
# min_periods=1 lets the first row produce a value from a single observation.
rolling_mean = df5[['B']].rolling(window=2, min_periods=1).mean()  # similarly sum() can be used
print(rolling_mean)

# cumulative sum of B within each group of A
print("cumulative sum:")
# GroupBy.cumsum returns a Series aligned to df5's own index, so it can be
# stored as a column directly. groupby(...).apply(lambda x: x.cumsum()) can
# come back indexed by (group key, row label) and then fails to align.
df5['no_cumulative'] = df5.groupby('A')['B'].cumsum()
df5

In [None]:
# date time functions

dt = datetime.datetime.now()
print("Time right now is:", dt)
print("The datetime is a class. Hence it's type is:", type(dt))
print("Hour:", dt.hour)
print("Minute:", dt.minute)
print("Second:", dt.second)
print("Microsecond:", dt.microsecond)
print("Year:", dt.year)
print("Date:", dt.date())


# Formatting
print("Time in the ISO format for quick reading:", dt.isoformat())
print("Time in a simpler format:", dt.strftime("%Y %b %d %I:%M:%S %p"))

# Parsing a datetime from a string
print("Time parsed from the ISO format:", datetime.datetime.strptime('2016-03-03T09:21:00.737887', '%Y-%m-%dT%H:%M:%S.%f'))
print("Time parsed from a simpler format:", datetime.datetime.strptime("2016-03-03 23:01:02", "%Y-%m-%d %H:%M:%S"))

# Calendar arithmetic: relativedelta shifts by whole months.
now = datetime.datetime.today()
next_month = now + relativedelta(months=1)
last_month = now - relativedelta(months=1)
print("Time now:", now)
print("Time after a month:", next_month)
print("Time a month ago:", last_month)

# time.time() can be used to see the runtime of codes

# in pandas: a frame with year/month/day columns converts to one datetime
# value per row
sample = pd.DataFrame({'year': [2015, 2016],
                       'month': [2, 3],
                       'day': [4, 5]})
pd.to_datetime(sample)
    

#### Multi processing

In [None]:
# Three small single-column frames to hand to the worker processes.
# `.ix` was removed in pandas 1.0; `.loc` keeps the same label-based,
# end-inclusive slicing. `.copy()` makes each piece an independent frame
# rather than a selection derived from `df`.
# NOTE(review): assumes df has the default integer index, so labels 5 and 10
# each appear in two of the frames, as they did with the old `.ix` slices.
df_1 = df.loc[:5, ['Survived']].copy()
df_2 = df.loc[5:10, ['Survived']].copy()
df_3 = df.loc[10:15, ['Survived']].copy()

def operation(column, data):
    """Recode one column of ``data`` in place and return ``data``.

    Every value equal to 1 in ``data[column]`` becomes ``'S'``; every other
    value becomes ``'D'``. The frame that was passed in is modified and is
    also the object returned.
    """
    def _to_label(value):
        # 1 -> 'S', anything else -> 'D'
        if value == 1:
            return 'S'
        return 'D'

    data[column] = data[column].map(_to_label)
    return data

files = [df_1, df_2, df_3]
column = 'Survived'

# Bind the column name up front so that each worker only needs to receive
# one frame as its argument.
partial_call = partial(operation, column)

# Using the pool as a context manager shuts the worker processes down when
# the block exits, including when map() raises. pool.map() blocks until all
# results are in, so nothing is still running at that point.
with Pool(processes=3) as pool:
    list_data = pool.map(partial_call, files)

# Stack the processed pieces back into one frame.
pd.concat(list_data, axis=0)

### Plot functions

In [None]:
# simple plot
# sample points from 0 (inclusive) to 5 (exclusive) in steps of 0.2
t = np.arange(0., 5., 0.2)
print(t)

# one curve per call: red dashes (t), blue squares (t^2), green triangles (t^3)
plt.plot(t, t, 'r--')
plt.plot(t, t ** 2, 'bs')
plt.plot(t, t ** 3, 'g^')
plt.show()

#subplots

import numpy as np
import matplotlib.pyplot as plt

def f(t):
    """Return exp(-t) * cos(2*pi*t), evaluated element-wise on ``t``."""
    decay = np.exp(-t)
    oscillation = np.cos(2 * np.pi * t)
    return decay * oscillation

# Two sampling grids over [0, 5): a coarse one drawn as markers and a fine
# one drawn as a line.
t1 = np.arange(0.0, 5.0, 0.1)
t2 = np.arange(0.0, 5.0, 0.02)

plt.figure(1)

# upper panel of a 2-row, 1-column grid: f on both grids
plt.subplot(2, 1, 1)
plt.plot(t1, f(t1), 'bo', t2, f(t2), 'k')

# lower panel: the cosine term on its own
plt.subplot(2, 1, 2)
plt.plot(t2, np.cos(2 * np.pi * t2), 'r--')
plt.show()


### Feature Selection

- Feature selection using SelectFromModel and LassoCV
- K means clustering
- Forming Correlation matrix

In [None]:
# Method 1 --> Feature selection using SelectFromModel and LassoCV

# Load the boston dataset.
# NOTE(review): load_boston is no longer shipped with recent scikit-learn
# releases; confirm the installed version still provides it.
boston = load_boston()
X, y = boston['data'], boston['target']

clf = LassoCV()

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)

# Transform once before the loop so that X_transform is defined even when
# the initial threshold already leaves two or fewer features and the loop
# body below never runs.
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]

# Raise the threshold step by step until at most two features remain.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.
plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("Feature number 1")
plt.ylabel("Feature number 2")
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()

In [None]:
# K means clustering
data = pd.read_csv('./test.csv')
# we take a transpose so as to have variables as rows
data = data.T
print(data.head())
k = 2

# Use a plain ndarray: recent scikit-learn versions reject np.matrix input,
# and ndarray column slices are 1-D, which is what plt.plot expects.
X = np.asarray(data)
print(X.shape)
kmeans = KMeans(n_clusters=k).fit(X)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# one matplotlib format string per cluster
color = ['o', 'ro']

for c, i in zip(color, range(k)):
    # select only the observations assigned to cluster i
    ds = X[labels == i]
    # scatter the first two coordinates of those observations
    plt.plot(ds[:, 0], ds[:, 1], c)
    # plot the centroid of cluster i as a black x
    lines = plt.plot(centroids[i, 0], centroids[i, 1], 'kx')
    # make the centroid x's bigger (marker size) and bolder (marker edge width)
    plt.setp(lines, ms=15.0)
    plt.setp(lines, mew=2.0)
plt.show()

In [None]:
# Method 3 --> Forming Correlation Matrix

# networkx is imported here because the file-level import of it is commented
# out, so `nx` would otherwise be undefined in this cell.
import networkx as nx

# NOTE(review): load_boston is no longer shipped with recent scikit-learn
# releases; confirm the installed version still provides it.
boston = load_boston()
X_boston = boston.data
y_boston = boston.target

boston = pd.DataFrame(X_boston, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
                                         'TAX', 'PTRATIO', 'B', 'LSTAT'])

corr_matrix = boston.corr()

# Connect every pair of distinct variables whose correlation exceeds 0.6.
# The matrix is symmetric, so each unordered pair is visited only once.
g = nx.Graph()
variables = list(boston.columns)
for position, var1 in enumerate(variables):
    for var2 in variables[position + 1:]:
        if corr_matrix.loc[var1, var2] > 0.6:
            g.add_edge(var1, var2)

# Each connected component is one group of mutually linked variables.
# connected_component_subgraphs no longer exists in networkx, so the
# component subgraphs are built from connected_components instead.
print([list(g.subgraph(component).edges()) for component in nx.connected_components(g)])

# A list passed as node_color needs one entry per node, so the four-colour
# palette is cycled to match however many nodes the graph ended up with.
palette = ['r', 'g', 'c', 'y']
node_colors = [palette[i % len(palette)] for i in range(g.number_of_nodes())]
nx.draw(g, cmap=plt.get_cmap('jet'), node_color=node_colors, with_labels=True)