## Useful code

#### A cheat sheet of useful Python code for data science.

# Pandas

Download a dataset from the internet to your current directory.

In [None]:
import urllib.request
import pandas as pd
# Local file name for the downloaded CSV. Named after the forest-fires
# dataset that is actually fetched, instead of the unrelated 'dataR2.csv'.
file_name = 'forestfires.csv'

def download_file(
    file_name,
    url='https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv',
):
    """Download ``url`` and store it locally as ``file_name``.

    Args:
        file_name: Destination path for the downloaded file.
        url: Address to fetch. Defaults to the UCI forest-fires CSV, which
            keeps existing ``download_file(file_name)`` calls working.

    Returns:
        ``file_name``, so the result can be passed straight to a reader.
    """
    print('Downloading the dataset')
    urllib.request.urlretrieve(url, file_name)
    return file_name


# Fetch the CSV and store it under `file_name`.
download_file(file_name)

# Load the downloaded file into a DataFrame and preview its first rows.
df = pd.read_csv(file_name)
df.head()

In [None]:
# Read the competition files; the first column of train/test becomes the index.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col=0,
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col=0,
)
sub = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

# Sanity report per table: shape, total missing cells, duplicated rows.
print(
    'Train Shape: {}\nMissing Data: {}\nDuplicates: {}\n'.format(
        train.shape, train.isna().sum().sum(), train.duplicated().sum()
    )
)
print(
    'Test Shape: {}\nMissing Data: {}\nDuplicates: {}\n'.format(
        test.shape, test.isna().sum().sum(), test.duplicated().sum()
    )
)

# Keep a de-duplicated version of the training data for later cells.
train_d = train.drop_duplicates()
print('Dropping Duplicates\nNew Train Shape: {}'.format(train_d.shape))

Split your data into categorical and numerical columns for encoding, standardization or analysis.

In [None]:
# Columns stored with the "object" dtype are treated as categorical.
categorical_cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype == "object"
]
categorical = df[categorical_cols]
categorical

In [None]:
# Names of the columns whose dtype is int64 or float64.
numerical_cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype in ['int64', 'float64']
]
numerical_cols

In [None]:
# Numeric (int64/float64) part of the frame ...
numerical_cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype in ['int64', 'float64']
]
numerical = df[numerical_cols]

# ... without the "temp" column; show three random rows.
num = numerical.drop("temp", axis=1)
num.sample(3)

Describe only the categorical (object-dtype) columns.

In [None]:
# include=['O'] limits the summary to object-dtype columns.
df.describe(include=['O'])

Measure your NaNs as a percentage of the rows.

In [None]:
# The mean of the boolean NaN mask is the NaN fraction per column; scale it to percent.
df.isna().mean() * 100

Group by one column and compare the mean of another (here: mean `temp` per `month`, sorted from highest to lowest).

In [None]:
# Mean "temp" per month, highest mean first.
(
    df[["month", "temp"]]
    .groupby("month", as_index=False)
    .mean()
    .sort_values(by="temp", ascending=False)
)

In [None]:
# Number of distinct values in each object-dtype column.
df.select_dtypes(object).nunique()

# Sklearn

##  One Hot Encode categorical features 

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Encode every categorical column as 0/1 indicator columns.
ohe = OneHotEncoder()
val_cat = ohe.fit_transform(categorical).toarray()

# Keep readable column names (e.g. "<feature>_<category>") and the original
# row index, so a later column-wise concat with other frames stays aligned.
val_cat = pd.DataFrame(
    val_cat,
    columns=ohe.get_feature_names_out(),
    index=categorical.index,
)
val_cat

In [None]:
# Put the one-hot columns next to the numeric features (which exclude "temp").
# NOTE(review): axis=1 concatenation aligns rows on the index — confirm both frames share the same index.
df_ohe = pd.concat([val_cat, num], axis=1)

In [None]:
# Inspect 10 random rows of the encoded frame.
df_ohe.sample(10)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the encoded features; the target is the raw "temp" column.
scaler = MinMaxScaler()
X = scaler.fit_transform(df_ohe.to_numpy())
y = df["temp"]

## KFold validation

In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score

In [None]:
# Target is FFMC; features are every other column except month and day.
y = df["FFMC"]
X = df.drop(["FFMC", "month", "day"], axis=1)

In [None]:
# 10-fold splitter. shuffle=True without a random_state, so folds change between runs.
kf = KFold(n_splits = 10, shuffle = True)

# Number of folds the splitter will produce.
kf.get_n_splits(X)

In [None]:
import numpy as np
reg = linear_model.LinearRegression()

# R2 score of every fold.
results = []

for train_index, test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows are selected with .iloc;
    # label-based lookup only works when the index happens to be 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    reg.fit(X_train, y_train)
    predictions = reg.predict(X_test)
    # Score once and reuse the value for both the printout and the average.
    fold_r2 = r2_score(y_test, predictions)
    print("R2: ", fold_r2)
    results.append(fold_r2)

# Mean R2 over all folds.
print("R2 medio: ", np.mean(results))

## Neural networks

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
# FFMC is the regression target; month and day are left out of the features.
y = df["FFMC"]
X = df.drop(["FFMC", "month", "day"], axis=1)

In [None]:
# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=42)

In [None]:
# Learn the scaling statistics on the training split only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# One hidden layer of 5 logistic units, trained with the L-BFGS solver.
clf = MLPRegressor(
    hidden_layer_sizes=(5, ),
    activation="logistic",
    solver="lbfgs",
    alpha=1e-5,
    max_iter=100000,
    warm_start=True,
)

# Fit on the scaled training split and score on the scaled test split.
model = clf.fit(X_train_scaled, y_train)
predictions = model.predict(X_test_scaled)
print("R2: ", r2_score(y_test, predictions))

Split the dataset for training an ML model.

In [None]:
from sklearn.model_selection import train_test_split
# 70% of the rows for training, shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, shuffle=True, random_state=1234
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default settings, fitted on the training split.
rf = RandomForestRegressor().fit(X_train, y_train)
rf_predict = rf.predict(X_test)

In [None]:
# Score the random-forest predictions on the test split.
print("R2: ", r2_score(y_test, rf_predict))

## Feature selection w/ statsmodels

In [None]:
from sklearn import linear_model
import statsmodels.api as sm # https://pypi.org/project/statsmodels/
import pandas as pd

In [None]:
# Same setup as before: FFMC as target, month and day excluded from the features.
y = df["FFMC"]
X = df.drop(["FFMC", "month", "day"], axis=1)

In [None]:
# Fit a plain linear regression on all rows and predict on the same data.
regr = linear_model.LinearRegression().fit(X, y)
predictions = regr.predict(X)

In [None]:
# Add a constant column so the OLS model includes an intercept.
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
# Print the regression summary table for the fitted model.
print(est2.summary())

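Features whose p-value (the `P>|t|` column of the summary) is high — around 0.5 — are good candidates for removal. Features with p-values close to 0.0 are the most significant and may offer the best results.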

## Feature selection w/ RFE

In [None]:
from sklearn.feature_selection import RFE
from sklearn import linear_model
import pandas as pd

In [None]:
# Recursive feature elimination around a linear regression, keeping 5 features.
regr = linear_model.LinearRegression()
selector = RFE(estimator=regr, n_features_to_select = 5, verbose=4)
selector.fit(X, y)

In [None]:
# Rank of every feature; the selected features have rank 1.
print(selector.ranking_)

In [None]:
# Names of the columns kept by the selector.
X.columns[selector.support_]

## PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Standardize the features; X is overwritten with the scaled values.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

In [None]:
# Fit a PCA that keeps two components.
pca = PCA(n_components = 2)
pca.fit(X)

In [None]:
# Share of the total variance explained by each kept component.
pca.explained_variance_ratio_

In [None]:
# Project every row onto the two fitted components.
newval = pca.transform(X)

In [None]:
# Scatter plot of the first component against the second.
plt.scatter(newval[:,0], newval[:,1])

# Seaborn and Matplotlib

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

To plot all the features at once:

In [None]:
import math

# Fixed number of rows; the column count is rounded up so the grid always has
# at least one slot per feature (a 4 x int(n/3) grid is too small for some
# feature counts, e.g. 5 columns would get a 4 x 1 grid).
n_rows = 4
n_cols = max(1, math.ceil(len(numerical_cols) / n_rows))

plt.figure(figsize=(10, 20))
for position, column in enumerate(numerical_cols, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(y=df[column], color='gray', orient='v')

# Adjust the spacing once, after every subplot has been drawn.
plt.tight_layout()

## Correlation heatmap

In [None]:
f, ax = plt.subplots(figsize=(10, 10))
# Correlate numeric columns only: with text columns in the frame,
# df.corr() raises on pandas >= 2.0.
sns.heatmap(df.select_dtypes(include="number").corr(), cmap="RdYlGn", annot=True)

In [None]:

f, ax = plt.subplots(figsize=(10, 10))
# Correlation of every numeric column with the target, strongest positive first.
# Non-numeric columns are excluded because df.corr() raises on them in pandas >= 2.0.
df_corr = (
    df.select_dtypes(include="number")
    .corr()
    .loc[:, ["temp"]]  # change the target feature
    .sort_values("temp", ascending=False)
)
sns.heatmap(df_corr, annot=True, cmap="RdYlGn", vmin=-1, vmax=1)

In [None]:
# One "temp" histogram per month. FacetGrid creates its own figure, so a
# preceding plt.figure() call would only leave an empty extra figure behind.
sns.FacetGrid(df, col="month").map(plt.hist, "temp", bins=20)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import plotly.express as px
import seaborn as sns

In [None]:
init_notebook_mode(connected=True)
pal = sns.color_palette("mako_r", 12).as_hex()[:10]

# Share of each target class in percent; value_counts already returns the
# classes most-frequent first. The columns are named explicitly ('index',
# 'target') because the names produced by a bare reset_index() differ between
# pandas versions.
bact = (
    train_d['target']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('index')
    .reset_index(name='target')
)
# Show readable class labels on the axis.
bact['index'] = bact['index'].str.replace('_', ' ')

# Custom template that only sets the font.
temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12)))
fig = px.bar(bact, x='index', y='target', text='target', color='index',
             color_discrete_sequence=pal, opacity=0.8)
fig.update_traces(texttemplate='%{text:,.2f}%', textposition='outside',
                  marker_line=dict(width=1, color='#28221D'))
# The percentage is printed on each bar, so the y axis is hidden.
fig.update_yaxes(visible=False, showticklabels=False)
fig.update_layout(template=temp, title_text='Distribution of Bacteria Species',
                  xaxis=dict(title='', tickangle=25, showline=True),
                  height=450, width=700, showlegend=False)
fig.show()

## I'll add more, let me know if you have useful code to share.