## Load data

In [None]:
import pandas as pd

# read_csv() requires a source, and the result must be kept: the cells below
# all operate on `df`.
DATA_PATH = "data.csv"  # TODO: replace with the real file path or URL
df = pd.read_csv(DATA_PATH)

## Peek at data
- missing data?
- obvious outliers?
- linear or non-linear correlations?
- strong correlation between dependent terms? (drop one? combine the two?)
- consider PCA, or other dimensionality reduction?

In [None]:
# Quick-look EDA snippets. In a single cell only the last expression is
# rendered automatically, so run these one at a time (or wrap in display()).
df.hist(figsize=(10, 10))  # matrix of histograms for all numeric variables
pd.plotting.scatter_matrix(df, figsize=(10, 10))  # matrix of scatter plots with hist down the diagonal
df.describe()
# Restrict to numeric columns so categorical/text columns cannot break corr().
df.select_dtypes('number').corr()
# 'var' is a placeholder column name; bracket access is required because
# `df.var` resolves to the DataFrame.var() method, not a column.
df['var'].value_counts()  # will tabulate a categorical variable
df.isnull().sum()  # will count up null values in each column

## Clean by dropping/replacing null values or outliers

In [None]:
from sklearn.impute import SimpleImputer

# Each of the three calls below returns a NEW frame and leaves `df` untouched;
# assign the result of whichever option you pick (e.g. `df = df.dropna()`).
# 'var' is a placeholder column name.
df.dropna(subset=['var'])  # drop rows where 'var' is NaN
df.dropna()                # drop all rows with any NaN
df.drop('var', axis=1)     # drop the column

# Fill NaN with the column median (mean, etc. work the same way). Assigning
# back avoids a chained in-place fillna, which may never reach `df`.
df['var'] = df['var'].fillna(df['var'].median())

imputer = SimpleImputer(strategy='median')  # ONLY works on numerical data
X = imputer.fit_transform(df)
df = pd.DataFrame(X, columns=df.columns, index=df.index)  # transform from numpy matrix back to a dataframe

## Categorical one-hot

In [None]:
# One-hot encode the categorical columns held in `df_cat`.
# Note: the encoded result is a sparse matrix, not a dense array.
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
cat_encoder.fit(df_cat)                  # learn the categories of each column
df_1hot = cat_encoder.transform(df_cat)  # encode the same data

## Transformation pipeline

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric columns: fill gaps with the median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode.
cat_pipeline = Pipeline(steps=[('one_hot', OneHotEncoder())])

# Route each group of columns to its own sub-pipeline. `num_attribs` and
# `cat_attribs` are the column selections and must be defined beforehand.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs),
    ]
)

# NOTE(review): this fits the preprocessor on all of X; X_prepared is not
# referenced again in the visible cells - confirm it is for inspection only.
X_prepared = preprocessor.fit_transform(X)

## Train, dev, test split

In [None]:
from sklearn.model_selection import train_test_split
# Two successive 80/20 splits with the same seed: first hold out the test set,
# then carve the dev set out of what remains (64% train / 16% dev / 20% test).
split_kwargs = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, **split_kwargs)

## Setup model pipeline and test

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

# The rest of the notebook treats this as a classification pipeline: it is
# scored with "accuracy" and tuned over `classifier__C` / `classifier__penalty`.
# LinearRegression supports none of those, so the final step is a
# LogisticRegression. Swap in a regressor (and regression scoring) only if the
# target is continuous.
clf = Pipeline(
    steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())]
)

# Fit preprocessing and model together on the training split only, then
# report the score on the held-out dev split.
clf.fit(X_train, y_train)
print('model score: {:4.2f}'.format(clf.score(X_dev, y_dev)))

In [None]:
# Visualize pipeline: switch scikit-learn's estimator display to "diagram"
# mode, then show `clf` as the cell's last expression so the notebook renders it.
from sklearn import set_config

set_config(display="diagram")
clf

## Cross validation

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy of the full pipeline on the training split;
# the per-fold scores are the cell output.
cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)

## Hyper-parameter grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Two independent sub-grids (not their cross product): the first varies C with
# the default penalty, the second varies the penalty with the default C.
# Keys are `<step name>__<parameter>`; C and penalty assume the 'classifier'
# step is a LogisticRegression-style estimator.
param_grid = [
    {'classifier__C': [0.1, 1, 10]},
    {
        'classifier__penalty': ['l2', 'l1'],
        # The default lbfgs solver does not support 'l1'; liblinear handles
        # both penalties, so every candidate in this sub-grid can be fitted.
        'classifier__solver': ['liblinear'],
    },
]

# Builds the search only; call grid_search.fit(X_train, y_train) to run it.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', return_train_score=True)

## Logistic Regression