I find that I need to do these tasks whenever I tackle a data challenge, so I've compiled these shortcuts to minimize the time spent Googling.

# Packages to import

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import Imputer, scale, normalize
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import linear_model

from sklearn.model_selection import train_test_split,LeaveOneOut 
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics

from sklearn.metrics.pairwise import cosine_similarity
import os, shutil, glob

# Get data into Pandas

In [None]:
# Options shared by both loading examples: comma-separated input, with every
# column read as text (dtype=str).
read_opts = dict(sep=",", dtype=str)

# Example 1: header=None, so no row of the file is used for column names.
df = pd.read_csv('breast-cancer-wisconsin.data.txt', header=None, **read_opts)
# Example 2: default header handling. Note that this rebinds df, replacing
# the frame loaded by example 1.
df = pd.read_csv('11-2-16_data_analyst_case_study.csv', **read_opts)

In [None]:
# Name the columns, then promote 'id' from a regular column to the row index.
column_names = ['id', 'clump_thickness', 'uniformity_size', 'uniformity_shape']
df.columns = column_names
df.set_index(keys='id', inplace=True)

In [None]:
# Summary statistics for the frame; include='all' asks describe() to report
# on every column rather than only the numeric ones.
df.describe(include='all')

#### Change data type

In [None]:
numeric_col = ['clump_thickness', 'uniformity_size', 'uniformity_shape']
# errors='coerce' turns values that cannot be parsed into NaN instead of raising.
df[numeric_col] = df[numeric_col].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# Single date column, format inferred by pandas.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
# Several date columns at once, all written as YYYYMMDD; unparseable -> NaT.
df[dates_columns] = df[dates_columns].apply(
    lambda column: pd.to_datetime(column, format='%Y%m%d', errors='coerce')
)

In [None]:
# Left-pad the 'sequence' strings with zeros up to a width of 12 characters.
users_df['sequence'] = users_df['sequence'].str.zfill(12)

# Feature engineering

In [None]:
now = datetime.now()

# Whole years between now and date_of_birth (expects a datetime64 column).
# .astype('timedelta64[Y]') is rejected by pandas >= 2.0, so divide by the
# length of NumPy's 'Y' unit (365.2425 days) and floor instead. The result is
# a float column of whole years; missing dates stay NaN.
elapsed = (now - df['date_of_birth']).abs()
df['how_long_ago'] = np.floor(elapsed / pd.Timedelta(days=365.2425))

# Plain timedelta between two datetime columns.
df['time_between_dates'] = df['date1'] - df['date2']

In [None]:
# Number of non-null closed_date entries per sequence_number, returned as a
# one-column frame named 'num_closed_cards'.
closed_per_sequence = trades_shortdf.groupby('sequence_number')['closed_date'].count()
num_closed_cards = closed_per_sequence.to_frame(name='num_closed_cards')

In [None]:
# create column with the most frequently occurring string in another column
def _most_frequent(values):
    """Return the first index entry of value_counts(), i.e. the top count."""
    return values.value_counts().index[0]


eqc_code = (
    df.groupby('sequence_number')['equal_credit_opportunity_act_code']
    .agg(_most_frequent)
    .to_frame()
)

In [None]:
# Side-by-side (axis=1) concatenation; join='inner' keeps only the index
# labels present in every frame.
frames = [first_df, complete, new_features]
complete3 = pd.concat(frames, axis=1, join='inner')

# Deal with missing data

#### Impute missing data

In [None]:
# Median imputation along axis 0 for entries marked 'NaN'.
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
# fit_transform's result is wrapped back into a DataFrame carrying the
# original column names and index.
df = pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)

#### Drop missing data

In [None]:
# drop the entire row when household_size is null
required_columns = ['household_size']
df = df.dropna(axis=0, how='any', subset=required_columns)

# Normalize numerical data

In [None]:
# Run sklearn's normalize() over the numeric columns and write the result
# back under the same column names and row index.
normalized_values = normalize(data_df[numeric_col])
data_df[numeric_col] = pd.DataFrame(
    normalized_values, columns=numeric_col, index=data_df.index
)
del normalized_values  # drop the extra reference to the intermediate array

# Dummify categorical data

In [None]:
# One-hot encode the categorical columns; prefix= sets the start of each
# generated column name ('sex_...', 'surgeon_...').
dummy_sex = pd.get_dummies(data_df['patient_sex'], prefix='sex')
dummy_surgeon = pd.get_dummies(data_df['surgeon_id'], prefix='surgeon')

# Build ready_df from data_df -- the same frame the dummies were derived
# from -- so the join aligns on a matching index and the raw categorical
# columns dropped afterwards actually exist. drop() returns a new frame, so
# data_df itself is left untouched.
ready_df = data_df.join([dummy_sex, dummy_surgeon])
ready_df = ready_df.drop(['patient_sex', 'surgeon_id'], axis=1)

# Train machine learning model

In [None]:
target = 'pass(4)/fail(2)'

# Pick the feature columns by name rather than by position, so the target is
# excluded wherever it sits in the frame and features_labels lines up
# one-to-one with the columns of X.
features_labels = ready_df.columns.drop([target])
X = ready_df[features_labels].values
# Boolean labels: True where the target column equals 4.
y = (ready_df[target].values == 4)

# Hold out 25% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#### Random Forest Classification

In [None]:
# 100 trees, using all available cores (n_jobs=-1).
clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
clf.fit(X_train, y_train)
# scikit-learn's fit() returns the estimator itself, so 'model' and 'clf'
# refer to the same fitted object and either can be used to predict.
model = clf
predictions = model.predict(X_test)

#### Random Forest Regression

In [None]:
# 100 trees, using all available cores (n_jobs=-1).
regr = RandomForestRegressor(n_jobs=-1, n_estimators=100)
regr.fit(X_train, y_train)
# scikit-learn's fit() returns the estimator itself, so 'model' and 'regr'
# refer to the same fitted object and either can be used to predict.
model = regr
predictions = model.predict(X_test)

#### Kmeans Clustering

In [None]:
# KMeans is not part of the imports cell at the top, so import it here.
from sklearn.cluster import KMeans

# Two clusters; the seed is fixed (42, the same value train_test_split uses)
# so repeated runs give the same assignment.
est = KMeans(n_clusters=2, random_state=42)
# Fit on X and return the cluster index assigned to each row.
y_pred = est.fit_predict(X)

# Evaluation

In [None]:
# Feature importances of the fitted model, listed from largest to smallest.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

n_features = X_train.shape[1]
for f in range(n_features):
    feature_idx = indices[f]
    # rank, field width for the label, feature name, importance value
    row = (f + 1, 30, features_labels[feature_idx], importances[feature_idx])
    print("%3d) %-*s %f" % row)

# Cross-validated score; the reported spread is two standard deviations.
scores = cross_val_score(model, X, y, scoring='f1') #or r2
mean_score = scores.mean()
spread = scores.std() * 2
print("Cross-validated Training F1-score: %0.2f (+/- %0.2f)" % (mean_score, spread))
print('')

# Cross-validated predictions for every row, scored against the true labels.
predictions = cross_val_predict(model, X, y)
accuracy = metrics.accuracy_score(y, predictions)
precision = metrics.precision_score(y, predictions)
recall = metrics.recall_score(y, predictions)
roc_auc = metrics.roc_auc_score(y, predictions)

report = (
    ('Cross-validated Predicted Accuracy:', accuracy),
    ('Cross-validated Predicted Precision:', precision),
    ('Cross-validated Predicted Sensitivity (aka recall):', recall),
)
for label, value in report:
    print(label, value)
#print('Predicted AUC of ROC:', roc_auc)

Available values for the `scoring` parameter are listed at: http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

# Plotting

In [None]:
# sorted_df is bound to the same object as data_df (no copy is made), so the
# numeric conversion and the in-place sort below modify data_df as well.
sorted_df = data_df
sorted_df['surgeon_id'] = pd.to_numeric(sorted_df['surgeon_id'])
sorted_df.sort_values(by='surgeon_id', ascending=True, inplace=True)

# One bar per surgeon_id, split by the value of 'success'.
ax = sns.countplot(data=sorted_df, x='surgeon_id', hue='success')

# Exploring data

In [None]:
# Count rows per initial severity, then print SEVERE as a share of
# SEVERE + MINOR (a fraction between 0 and 1).
severity = data_df['initial_severity']
all_severe = sum(severity == 'SEVERE')
all_minor = sum(severity == 'MINOR')
severe_share = all_severe / (all_severe + all_minor)
print('%initially severe:  ', severe_share)

In [None]:
# Reusable boolean masks; combine them with & to filter rows.
outcome = data_df['success']
surgeon6 = data_df['surgeon_id'] == 6
failed = outcome == 'FALSE'
success = outcome == 'TRUE'

# Number of non-null 'age' values among surgeon 6's successful rows.
data_df.loc[surgeon6 & success, 'age'].count()

In [None]:
# look at only rows with successful outcomes
succeeded = data_df['success'] == 'TRUE'
success_df = data_df[succeeded]
success_df['age'].describe()

# File I/O (unlikely)

In [None]:
# os.chdir() and os.path.exists() do not expand '~' themselves, so resolve the
# home directory up front.
original_dir = os.path.expanduser('~/Documents/something/')

# dirname() of a path ending in '/' is the same path without the trailing slash.
os.chdir(os.path.dirname(original_dir))

# Sibling folder named after the original directory plus a suffix.
TeraStitcher_dir = os.path.dirname(original_dir) + '_TeraStitcher'

# NOTE(review): search_word (the glob pattern) is not defined in this snippet;
# set it before running. The pattern is matched relative to the directory
# changed into above.
files_in_tile = glob.glob(search_word)
for filename in files_in_tile:
    # Build the full path inside the loop, where the file name is known.
    original_filefullpath = os.path.join(original_dir, filename)
    print('do something')

if os.path.exists(TeraStitcher_dir): # if this folder already exists, remove it
    shutil.rmtree(TeraStitcher_dir)
# Create the folder unconditionally so it also exists on the very first run,
# then work from inside it.
os.mkdir(TeraStitcher_dir)
os.chdir(TeraStitcher_dir)