# Machine Learning Cheat Sheet


    Created On: Timestamp('2019-10-29 15:37:20.527174')
    Created By: 'Siavash Saki'

## General Analysis

#### Choosing the best algorithm
          

<img src='https://scikit-learn.org/stable/_static/ml_map.png' width=500 align="middle" />

[Full Size Image](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)


#### Looking at the data
```python
# First look: sample rows, dtypes / non-null counts, summary statistics
df.head()
df.info()
df.describe()
sns.pairplot(df)

# Correlations are only defined for numeric columns; selecting them explicitly
# avoids errors when df also contains text / categorical columns.
df.select_dtypes('number').corr()
sns.heatmap(df.select_dtypes('number').corr())

# Distribution of the target (TARGET is the name of the target column).
# histplot replaces distplot, which is deprecated since seaborn 0.11.
sns.histplot(df[TARGET], kde=True)   # continuous target
sns.countplot(x=df[TARGET])          # categorical target

# Two histograms on one figure
sns.histplot(df[df['male'] == 1]['Survived'], kde=False)
sns.histplot(df[df['male'] == 0]['Survived'], kde=False)
```

#### Dealing with missing data
```python
df.isnull().sum()
sns.heatmap(df.isnull(),yticklabels=False,cbar=False)
```

#### Quick Regression Test
```python
sns.regplot(x='feature',y='target',data=df)
sns.residplot(x='feature',y='target',data=df)
```

#### One Hot Encoding
```python
# Making dummy variable
cat_dummy = pd.get_dummies(df['category_i'],drop_first=True)
df=pd.concat([df,cat_dummy],axis=1)
```

#### Splitting data into train and test set
```python
# TARGET is the name of the target column; use the same variable for both
# the features and the label so they cannot get out of sync.
X = df.drop(TARGET, axis=1)
y = df[TARGET]

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; a fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=100)
```

#### Splitting Time Series
```python
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
# Splits keep the temporal order: each test fold comes after its training fold.
ts = TimeSeriesSplit(n_splits=4)
cross_val_score(estimator, X, y, cv=ts)
```

#### Regression line evaluation
```python
from scipy import stats
pearson_coef, p_value = stats.pearsonr(df['wheel-base'], df['price'])
```

#### ANOVA
```python
from scipy import stats
F, p = stats.f_oneway(group_A, group_B, group_C)
```

#### Scaling Data

*MinMaxScaler:*
```python
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
scaled = min_max_scaler.fit_transform(x)
```
*StandardScaler:*
```python
from sklearn.preprocessing import StandardScaler 
ss= StandardScaler()
ss.fit(features)
scaled= ss.transform(features)
```

#### Principal component analysis (PCA)
```python
from sklearn.decomposition import PCA
pca= PCA(n_components=2)
pca.fit(scaled)
x_pca= pca.transform(scaled)

pca_components= pd.DataFrame(x_pca,columns=['First principal component','Second principal component'])
pca_data= pd.concat([pca_components,df['target']],axis=1)
sns.scatterplot(x='First principal component',y='Second principal component',hue='target',data=pca_data)
```

#### Cross Validation

For example, suppose the estimator is a support vector machine:
`svc= sklearn.svm.SVC()`

*one random train-test set:*
```python
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
svc.fit(X_train,y_train)
svc.score(X_test,y_test)


```
*n random train-test sets:*
```python
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
cv= ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
score= cross_val_score(svc, X,y, cv=cv)
print(score)
print(score.mean())

# For Regression:
r2_score= cross_val_score(lr, X,y, cv=cv)
MSE= -1 * cross_val_score(lr,X,y,cv=cv,scoring='neg_mean_squared_error')
```

*Cross validation (an integer `cv` uses "(Stratified)KFold" by default):*
```python
from sklearn.model_selection import cross_val_score,cross_val_predict
y_hat= cross_val_predict(svc, X,y, cv=10)
score= cross_val_score(svc, X,y, cv=10)
print(score)
print(score.mean())

# For Regression:
r2_score= cross_val_score(lr, X,y, cv=5)
MSE= -1 * cross_val_score(lr,X,y,cv=5,scoring='neg_mean_squared_error')
```

*K Fold:*
```python
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score,cross_val_predict
kf = KFold(n_splits=4,shuffle=True)
score= cross_val_score(svc, X,y, cv=kf)
print(score)
print(score.mean())
# kf.split(X) # indexes in train and test set

# For Regression: analog to cv
```

*Stratified K Fold (evenly distributed target class in train and test sets):*
```python
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score,cross_val_predict
skf= StratifiedKFold(n_splits=4,shuffle=True)
score= cross_val_score(svc, X,y, cv=skf)
print(score)
print(score.mean())
# skf.split(X,y) # indexes in train and test set

# For Regression: analog to cv
```

#### Grid Search

```python
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Every combination of C and gamma is evaluated with cross-validation.
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001, 0.00001]}
gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, verbose=2)
gs.fit(X_train, y_train)
gs.best_estimator_   # estimator refitted with the best parameters
gs.best_params_      # best parameter combination
gs.cv_results_       # detailed scores for every combination
y_hat = gs.predict(X_test)
```

#### Model Evaluation (classifier models)

*Jaccard index:*
```python
# jaccard_similarity_score was deprecated and then removed from scikit-learn;
# jaccard_score is its replacement.
from sklearn.metrics import jaccard_score
jaccard_score(y_test, y_hat)  # binary targets; pass e.g. average='macro' for multiclass
```


*log loss:*

```python
# For logistic regression
from sklearn.metrics import log_loss
log_loss(y_test, y_hat_prob)
```


*f1-score:*
```python
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(accuracy_score(y_test,y_hat),'\n',
      confusion_matrix(y_test,y_hat),'\n',
      classification_report(y_test,y_hat))
```
For more details:
* **[precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall)** 
* **[f1-score](https://en.wikipedia.org/wiki/F1_score)**\
<img src='https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Precisionrecall.svg/700px-Precisionrecall.svg.png' width=300 align="left" />

#### Saving and loading objects and using pickle
```python
import pickle
# Save an object
with open('file_name.pkl', 'wb') as fp:
    pickle.dump(my_object, fp)
# Load an object (only unpickle files you trust: loading can execute arbitrary code)
with open('file_name.pkl', 'rb') as fp:
    my_object = pickle.load(fp)
# Convenience methods for data frames
df.to_pickle('file_name.pkl')
df = pd.read_pickle('file_name.pkl')
```

#### Train model on new data
```python
# warm_start=True asks fit() to continue from the previously fitted state
# instead of re-initialising the model.
# NOTE(review): only estimators that expose a warm_start parameter support this,
# and what is reused differs per estimator; for incremental learning on new
# batches check whether the estimator offers partial_fit — confirm for the
# estimator in use.
estimator.set_params(warm_start=True)
estimator.fit(X_new,y_new)
```

____

## Linear Regression

Least Squares Regression:\
$min\ (sum\ of\ the\ squared\ residuals)$
```python
from sklearn import metrics
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train, y_train)

lm.intercept_
lm.coef_
# Coefficients and intercept in one table (X_train must be a DataFrame)
pd.concat([pd.DataFrame(lm.coef_.transpose(), index=X_train.columns, columns=['coef']),
           pd.DataFrame(lm.intercept_, index=['y_intercept'], columns=['coef'])])

y_hat = lm.predict(X_test)
# Predicted vs. actual: points close to the diagonal indicate a good fit
sns.scatterplot(x=y_test, y=y_hat)
# Residuals should look roughly normal and be centred on zero
sns.histplot(y_test - y_hat, kde=True).set_title('Residuals')

# score() returns R^2, i.e. the same value as metrics.r2_score
lm.score(X_train, y_train)
lm.score(X_test, y_test) == metrics.r2_score(y_test, y_hat)

metrics.mean_absolute_error(y_test, y_hat)
metrics.mean_squared_error(y_test, y_hat)
np.sqrt(metrics.mean_squared_error(y_test, y_hat))  # RMSE
metrics.explained_variance_score(y_test, y_hat)
metrics.r2_score(y_test, y_hat)

```

## Regularized linear regression methods:
* add bias and reduce variance (optimum fit: min total error)
* reduce model complexity
* prevent over-fitting which may result from simple linear regression
* scale your data before using regularized linear regression methods (e.g. with `StandardScaler`; older scikit-learn versions also accepted `normalize=True`, which was removed in version 1.2)\
<img src='https://miro.medium.com/max/481/1*cB0ESE9z3rB3-rpXPhwgWw.png' width=400 align="left" />

* ## Ridge Regression

$min\ (sum\ of\ the\ squared\ residuals\ +\ \alpha * slope^{2})$\
Ridge Regression can reduce the slope **close to zero**.
```python
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
# from sklearn.linear_model import RidgeCV # with cross-validation

rr = Ridge(alpha=0.01)
rr.fit(X_train, y_train)
rr.score(X_test, y_test)  # R^2 on the test set
mean_squared_error(y_test, rr.predict(X_test))
# Using GridSearch, we can tune alpha: {'alpha':[0.0001,0.001,0.1,1,10,100]}
# (the 'normalize' option was removed in scikit-learn 1.2; scale with StandardScaler instead)
           
```

* ## Lasso Regression

$min\ (sum\ of\ the\ squared\ residuals\ +\ \alpha * |slope|)$\
Lasso Regression can reduce the slope **to zero**. (Feature Elimination)

```python
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
# from sklearn.linear_model import LassoCV # with cross-validation

ls = Lasso(alpha=0.01)
ls.fit(X_train, y_train)
ls.score(X_test, y_test)  # R^2 on the test set
mean_squared_error(y_test, ls.predict(X_test))
# Using GridSearch, we can tune alpha: {'alpha':[0.0001,0.001,0.1,1,10,100]}
# (the 'normalize' option was removed in scikit-learn 1.2; scale with StandardScaler instead)
           
```

* ## Elastic Net Regression


hybrid of Lasso and Ridge:\
$min\ (sum\ of\ the\ squared\ residuals\ +\ \alpha_{1} * |slope| +\ \alpha_{2} * slope^{2})$


When `l1_ratio` is set to 0, elastic net is the same as ridge regression; when `l1_ratio` is set to 1, it is the same as lasso. For a true elastic net, `l1_ratio` lies somewhere between 0 and 1.

```python
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import ElasticNetCV # with cross-validation

en = ElasticNet()
# 'normalize' is no longer a valid ElasticNet parameter (removed in scikit-learn 1.2);
# standardise X beforehand, e.g. with StandardScaler.
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
          'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}
gs = GridSearchCV(en, param_grid=params, cv=5)
gs.fit(X, y)
gs.best_score_
gs.best_params_
```


## Polynomial Regression

```python
# With a single independent variable (x) the 1-D array has to be reshaped to 2-D.
# With a DataFrame of features no reshaping is needed.

X = df['x'].values
y = df['y'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

pf = PolynomialFeatures(degree=2)
X_poly = pf.fit_transform(X_train.reshape(-1, 1))

lm = LinearRegression()
lm.fit(X_poly, y_train)

# Predictions on the training set, if needed: y_hat = lm.predict(X_poly)
# The test set is only transformed: the feature map was fitted on the training data.
X_poly_test = pf.transform(X_test.reshape(-1, 1))
y_hat_test = lm.predict(X_poly_test)

# Simple visualisation to avoid overfitting: compare train and test R^2 per degree
degrees = range(1, 11)
r2_train = []
r2_test = []
for degree in degrees:
    pf = PolynomialFeatures(degree=degree)
    X_poly = pf.fit_transform(X_train.reshape(-1, 1))
    lm = LinearRegression()
    lm.fit(X_poly, y_train)
    y_hat = lm.predict(X_poly)
    X_poly_test = pf.transform(X_test.reshape(-1, 1))
    y_hat_test = lm.predict(X_poly_test)
    r2_train.append(r2_score(y_train, y_hat))
    r2_test.append(r2_score(y_test, y_hat_test))
plt.plot(degrees, r2_train, '--ro', label='train')
plt.plot(degrees, r2_test, '--bo', label='test')
plt.legend()

# If the features are a DataFrame (2-D), we can simply use a pipeline:
from sklearn.pipeline import Pipeline
pl = Pipeline([('scale', StandardScaler()),  # if necessary
               ('pr', PolynomialFeatures(degree=2)),
               ('lm', LinearRegression())])
pl.fit(X_train, y_train)
y_hat = pl.predict(X_test)

```


## Logistic Regression

```python
from sklearn.linear_model import LogisticRegression
logm= LogisticRegression()
logm.fit(X_train,y_train)

logm.intercept_
logm.coef_
pd.concat([pd.DataFrame(logm.coef_.transpose(),index=X_train.columns,columns=['coef']),
           pd.DataFrame(logm.intercept_,index=['y_intercept'],columns=['coef'])])

y_hat= logm.predict(X_test)
y_probability= logm.predict_proba(X_test)
```

## K Nearest Neighbors

```python
# Scale the features if necessary
from sklearn.preprocessing import StandardScaler
features = df.drop('TARGET CLASS', axis=1)
scaler = StandardScaler()
scaled_ar = scaler.fit_transform(features)
# Column names come from the feature frame itself, so the target does not
# have to be the last column of df.
scaled_df = pd.DataFrame(scaled_ar, columns=features.columns, index=df.index)

# Train / test split on the *scaled* features
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(scaled_df, df['TARGET CLASS'], test_size=0.30)

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
y_hat = knn.predict(X_test)

# Accuracy and error rate for a range of k values
k_values = range(1, 50)
a_scores = []
err = []
for k in k_values:
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    y_hat_k = knn_k.predict(X_test)
    a_scores.append(metrics.accuracy_score(y_test, y_hat_k))
    err.append(np.mean(y_hat_k != y_test))

# Plot against k itself so the x-axis shows the real k (starting at 1, not 0)
plt.plot(k_values, err, '--ro')
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error rate')
```

## Decision Trees

```python
from sklearn.tree import DecisionTreeClassifier
dtc= DecisionTreeClassifier()
dtc.fit(X_train,y_train)
y_hat= dtc.predict(X_test)
```
#### Tree Visualization
```python
# graphviz is needed
# sudo apt-get install graphviz
from io import StringIO  # sklearn.externals.six no longer exists in current scikit-learn
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydot
features = list(df.columns[:-1])  # assumes the target is the last column of df
dot_data = StringIO()
# Write the fitted tree (dtc) in DOT format into the in-memory buffer
export_graphviz(dtc, out_file=dot_data, feature_names=features, filled=True, rounded=True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph[0].create_png())

```

## Random Forests

```python
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train,y_train)
y_hat= rfc.predict(X_test)

```

## Support Vector Machines


```python
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
y_hat = svm.predict(X_test)

# Grid Search
from sklearn.model_selection import GridSearchCV
# 'precomputed' is left out on purpose: it expects X to be a square kernel
# matrix, so every fit with an ordinary feature matrix would fail.
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'degree': [2, 3, 4],  # Degree of the polynomial kernel function ('poly')
              'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.001, 0.0001, 0.00001]}
gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, verbose=2)
gs.fit(X_train, y_train)
gs.best_estimator_
gs.best_params_
y_hat = gs.predict(X_test)
```

## Clustering

### K-Means Clustering

```python
from sklearn.cluster import KMeans
# `features` is the (scaled) feature matrix to cluster
kmc = KMeans(n_clusters=2)
kmc.fit(features)

kmc.cluster_centers_
kmc.labels_

# Elbow method: plot the within-cluster sum of squares for a range of k
# and pick the k where the curve bends
cluster_counts = range(1, 16)
wcss = []
for k in cluster_counts:
    kmc = KMeans(n_clusters=k)
    kmc.fit(features)
    wcss.append(kmc.inertia_)
plt.plot(list(cluster_counts), wcss, '--ro')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Within cluster sum of squares')

```

### Hierarchical Clustering 

```python
from sklearn.cluster import AgglomerativeClustering
# Euclidean distance is the default. The `affinity` keyword was renamed to
# `metric` and later removed, so it is omitted to work on old and new versions.
ac = AgglomerativeClustering(n_clusters=3, linkage='average')
y_hat = ac.fit_predict(X)

# Dendrogram for the same agglomerative clustering (average linkage, euclidean)
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
# linkage() expects a condensed distance vector (or the raw observations);
# a square distance matrix would be misread as a set of observation vectors.
condensed_dist = pdist(X, metric='euclidean')
Z = hierarchy.linkage(condensed_dist, method='average')
dendro = hierarchy.dendrogram(Z)
```

### Density-Based Clustering (DBSCAN)

```python
from sklearn.cluster import DBSCAN 
epsilon = 3.6
minimumSamples = 10
db = DBSCAN(eps=epsilon, min_samples=minimumSamples).fit(X)
labels = db.labels_
```

## Pipeline

```python
# Example: text classification (bag of words -> tf-idf -> naive Bayes)

from nltk.corpus import stopwords  # requires nltk.download('stopwords') once
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('bow', CountVectorizer(stop_words=stopwords.words('english'))),
                     ('tfidf', TfidfTransformer()),
                     ('mnb', MultinomialNB())])
# Each step is fitted on the training data and applied in order at predict time
pipeline.fit(X_train, y_train)
y_hat = pipeline.predict(X_test)
```