In [None]:
# Numerical / tabular libraries and plotting setup shared by every section below.
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this hides every warning for the rest of the session, including
# deprecation warnings raised by the plotting and modelling calls further down.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
# Default matplotlib font size.
# NOTE(review): sns.set(...) below applies its own font scaling afterwards --
# confirm whether this setting still has any visible effect.
plt.rc('font', size=14)
import seaborn as sns
# Seaborn theme: white grid, 11x8 inch default figure, fonts scaled by 1.5.
sns.set(style='whitegrid', color_codes=True, rc={'figure.figsize':(11,8)}, font_scale=1.5)

In [None]:
# Directory that every input of this notebook is read from (the CSV files,
# the state shapefile and the CIFAR-10 folder are all addressed relative to it).
_DATA_DIR = '/classes/20800_winter2024/Data'

In [None]:
# Print the version of the `python3` executable found by the shell.
# NOTE(review): this may not be the interpreter the kernel itself runs on.
! python3 --version

# Lending Club example 

In [None]:
# Load the Lending Club loans; the first CSV column becomes the row index.
loan_raw = pd.read_csv('%s/loan.csv' % _DATA_DIR, index_col=0)

# `Paid` coding -- 0: charged off, 1: fully paid, -1: other.
# Keep only loans whose outcome is known. The explicit .copy() makes `loan` an
# independent frame, so the column added below is not written into a slice of
# the raw table (the situation pandas reports as SettingWithCopyWarning).
loan = loan_raw[loan_raw['Paid'] != -1].copy()

# Natural log of annual income, computed on the whole column at once.
loan['log_inc'] = np.log(loan['annual_inc'])

In [None]:
# Display the filtered loan table as the cell output.
loan

In [None]:
# (number of rows, number of columns) of the filtered loan table.
loan.shape

In [None]:
# Last ten rows, restricted to the first three columns.
loan.iloc[:,0:3].tail(10)

In [None]:
# Count plot: number of loans in each `Paid` class.
ax = sns.countplot(data=loan, x='Paid', palette='hls')
ax.set_xlabel('Default category')
ax.set_ylabel('Size')
ax.set_title('Sample size for default/ non-default')
#plt.savefig('%s/LendingClub/Counterplot_1.pdf'%(_FIGURE_DIR))
plt.show()

In [None]:
# Histogram 1: distribution of the loan amount.
# sns.distplot is deprecated; histplot on a density scale with a KDE overlay
# draws the same kind of figure (normalised histogram plus density curve).
sns.histplot(loan['loan_amnt'], kde=True, stat='density')
plt.xlabel('Loan amount')
plt.title('Histogram for Loan amount')
#plt.savefig('%s/LendingClub/Histgram_1.pdf'%(_FIGURE_DIR))
plt.show()

In [None]:
# Histogram 2: distribution of the interest rate.
# sns.distplot is deprecated; histplot on a density scale with a KDE overlay
# draws the same kind of figure (normalised histogram plus density curve).
sns.histplot(loan['int_rate'], kde=True, stat='density')
plt.xlabel('Interest rate')
plt.title('Histogram for Interest rate')
# plt.savefig('%s/LendingClub/Histgram_2.pdf'%(_FIGURE_DIR))
plt.show()

In [None]:
# Density of log income, drawn separately for each repayment outcome.
# `fill=` replaces the deprecated `shade=` keyword of sns.kdeplot.
for paid_value, colour in ((0, "r"), (1, "b")):
    sns.kdeplot(
        loan.loc[loan['Paid'] == paid_value, 'log_inc'],
        fill=True,
        color=colour,
        label="Paid = %d" % paid_value,
        bw_adjust=3,  # smoother than the default bandwidth
    )
plt.title('The effect of Annualized Income')
plt.xlabel('Log Income')
plt.legend()
plt.show()

In [None]:
# Density of the interest rate, drawn separately for each repayment outcome.
# `fill=` replaces the deprecated `shade=` keyword of sns.kdeplot.
for paid_value, colour in ((0, "r"), (1, "b")):
    sns.kdeplot(
        loan.loc[loan['Paid'] == paid_value, 'int_rate'],
        fill=True,
        color=colour,
        label="Paid = %d" % paid_value,
        bw_adjust=3,  # smoother than the default bandwidth
    )
plt.title('The effect of Interest Rate')
plt.xlabel('Interest rate')
plt.legend()
plt.show()

In [None]:
# Counts of each `Paid` outcome per application type ...
table = pd.crosstab(loan['application_type'], loan['Paid'])
# ... turned into within-row shares so that every bar sums to one.
row_totals = table.sum(axis=1).astype(float)
table.div(row_totals, axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Application Type')
plt.xlabel('Loan type')
plt.ylabel('Proportion of Applicants')
plt.show()

In [None]:
# Counts of each `Paid` outcome per loan grade ...
table = pd.crosstab(loan['grade'], loan['Paid'])
# ... turned into within-row shares so that every bar sums to one.
row_totals = table.sum(axis=1).astype(float)
table.div(row_totals, axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of grade')
# The bars are grouped by grade, so label the axis accordingly.
plt.xlabel('Grade')
plt.ylabel('Proportion of Applicants')
# Render the figure explicitly, like the other plotting cells.
plt.show()

In [None]:
# Conditional box plot 1: log income by repayment outcome.
ax = sns.boxplot(x="Paid", y="log_inc", data=loan)
# The figure is a box plot, so the title names it as one.
ax.set_title('Conditional box plots for log Income')
ax.set_xlabel('Default category')
ax.set_ylabel('Log Income')
plt.show()

In [None]:
# Conditional box plot 2: log income by repayment outcome, split by grade.
# `loan` is re-assigned to a grade-sorted copy with a fresh 0..n-1 index --
# presumably so the grade levels appear in order in the legend; later cells
# see this sorted frame.
loan = loan.sort_values(['grade'], ascending=True).reset_index(drop=True)
fig, ax = plt.subplots(figsize=(16, 12))
sns.boxplot(x="Paid", y="log_inc", hue="grade", data=loan, ax=ax)
ax.set_xlabel('Default category')
ax.set_ylabel('Log Income')
# The figure is a box plot, so the title names it as one.
ax.set_title('Conditional box plots for Log Income')
plt.show()

In [None]:
# Scatter plot 1: loan amount against log income, coloured by outcome.
ax = sns.scatterplot(data=loan, x="log_inc", y="loan_amnt", hue="Paid")
ax.set_xlabel('Log Income')
ax.set_ylabel('Loan amount')
ax.set_title('Scatter plots for Loan amount V.S. Log Income')
plt.show()

In [None]:
# Same scatter with one more dimension: colour encodes the grade and the
# marker style encodes the repayment outcome.
ax = sns.scatterplot(data=loan, x="log_inc", y="loan_amnt", hue="grade", style="Paid")
ax.set_xlabel('Log Income')
ax.set_ylabel('Loan amount')
ax.set_title('Scatter plots for Loan amount V.S. Log Income')
plt.show()

# CAPM example

In [None]:
# Read the stock panel.
stocks = pd.read_csv('%s/dj30.csv' % _DATA_DIR)

# Use AAPL as the worked example: keep date, return and price, indexed by date.
is_aapl = stocks['TICKER'] == 'AAPL'
stock = stocks.loc[is_aapl, ['date', 'RET', 'PRC']].set_index('date')
# Dates are handled as strings from here on (e.g. for the split-date lookup).
stock.index = stock.index.astype(str)

In [None]:
# Display the full stock panel as the cell output.
stocks

In [None]:
# Get the split date index
idx = stock.index.tolist()
# The index of `stock` was cast to strings, hence the string key.
stock_split = '20200824'
# Position of the split date within the index; as the last expression of the
# cell, it is shown as the cell output.
idx.index(stock_split)

In [None]:
# Time series of the AAPL price with the split date marked.
# The marker position is looked up from the date label rather than hard-coded
# as a row number, so it stays on the right observation if the data changes.
# (The series is plotted against row positions because its index holds strings.)
split_pos = stock.index.tolist().index('20200824')

plt.figure(figsize=(20, 8))
plt.vlines(split_pos, 100, 500, color="red", linestyles="dashed", lw=3)
# Arrow tip and text sit a fixed number of rows to the right of the marker.
plt.annotate(
    'Apple announces 4-1 stock split',
    xy=(split_pos + 22, 300),
    xytext=(split_pos + 82, 350),
    arrowprops=dict(facecolor='black', shrink=0.005, width=5, headwidth=26, headlength=8),
)

plt.xlabel('Time period')
plt.ylabel('Stock price')
plt.title('Time series for Apple Stock Price')

stock.PRC.plot()
plt.show()


In [None]:
# Time series of the AAPL return with the split date marked.
# The marker position is looked up from the date label rather than hard-coded
# as a row number, so it stays on the right observation if the data changes.
split_pos = stock.index.tolist().index('20200824')

plt.figure(figsize=(20, 8))
# The dashed line spans the full observed range of returns.
plt.vlines(split_pos, stock.RET.min(), stock.RET.max(), color="red", linestyles="dashed", lw=3)
# Arrow tip and text sit a fixed number of rows to the right of the marker.
plt.annotate(
    'Apple announces 4-1 stock split',
    xy=(split_pos + 22, 0.05),
    xytext=(split_pos + 82, 0.1),
    arrowprops=dict(facecolor='black', shrink=0.005, width=5, headwidth=26, headlength=8),
)

plt.xlabel('Time period')
plt.ylabel('Stock Return')
plt.title('Time series for Apple Stock Return')

stock.RET.plot()
plt.show()


In [None]:
# Time series of the AAPL cumulative return with the split date marked.
# The marker position is looked up from the date label rather than hard-coded
# as a row number, so it stays on the right observation if the data changes.
split_pos = stock.index.tolist().index('20200824')

plt.figure(figsize=(20, 8))
plt.vlines(split_pos, -0.5, 2.5, color="red", linestyles="dashed", lw=3)
# Arrow tip and text sit a fixed number of rows to the right of the marker.
plt.annotate(
    'Apple announces 4-1 stock split',
    xy=(split_pos + 22, 1),
    xytext=(split_pos + 82, 1.5),
    arrowprops=dict(facecolor='black', shrink=0.005, width=5, headwidth=26, headlength=8),
)

plt.xlabel('Time period')
plt.ylabel('Stock Cumulative Return')
plt.title('Time series for Apple Stock Cumulative Return')

# Cumulative return here is the running sum of RET (returns are not compounded).
np.cumsum(stock.RET).plot()
# plt.savefig('%s/CAPM/Time_series_3.pdf'%(_FIGURE_DIR))
plt.show()


In [None]:
# Comparison of cumulative returns across three tickers on a shared date axis.

def _cumulative_return(frame, ticker):
    """Return date/RET/PRC rows of `ticker` plus `CR`, the running sum of RET.

    The selection is copied so that adding `CR` never writes into a slice of
    `frame`.
    """
    selected = frame.loc[frame['TICKER'] == ticker, ['date', 'RET', 'PRC']].copy()
    selected['CR'] = selected['RET'].cumsum()
    return selected


# Parse the dates into a separate frame. `stocks` itself is left untouched, so
# the cells above that read its original `date` column can still be re-run.
stocks_dated = stocks.assign(date=pd.to_datetime(stocks['date'], format='%Y%m%d'))

AAPL = _cumulative_return(stocks_dated, 'AAPL')
IBM = _cumulative_return(stocks_dated, 'IBM')
JPM = _cumulative_return(stocks_dated, 'JPM')

fig, ax = plt.subplots(1, 1, figsize=(20, 8))
for ticker_frame, colour, name in ((AAPL, 'maroon', 'AAPL'), (IBM, 'gray', 'IBM'), (JPM, 'green', 'JPM')):
    ax.plot(ticker_frame['date'], ticker_frame['CR'], color=colour, label=name)
ax.set_xlabel('Time period')
ax.set_ylabel('Cumulative Return')
ax.set_title('Time series for different stock cumulative return')
ax.legend()
plt.show()


In [None]:
import statsmodels.formula.api as smf

# Reload the stock panel and attach the risk-free rate by date.
stocks = pd.read_csv('%s/dj30.csv' % _DATA_DIR)
riskfree = pd.read_csv('%s/DTB3.csv' % _DATA_DIR)
# 25200 = 100 * 252 -- presumably DTB3 is an annualised yield in percent that is
# converted to a daily decimal rate; TODO confirm against the data source.
riskfree['rf'] = riskfree.DTB3 / 25200
# Express the date as a YYYYMMDD integer so that it can be merged on `date`.
riskfree['date'] = pd.to_datetime(riskfree['DATE']).dt.strftime('%Y%m%d').astype(int)
stocks = pd.merge(stocks, riskfree, on="date")

unique_stock = stocks.TICKER.unique()

# One regression per ticker: excess stock return on excess market return.
# Inside a formula, `-` removes a term instead of subtracting, so the
# arithmetic has to be wrapped in I(...). Without it the model fitted is
# `RET ~ MrkRet` and the risk-free rate is silently ignored.
capm_rows = []
for mystock in unique_stock:
    df = stocks[stocks.TICKER == mystock].set_index('date')[['RET', 'MrkRet', 'rf']]
    reg = smf.glm(formula='I(RET - rf) ~ I(MrkRet - rf)', data=df).fit()
    # `params` is labelled (intercept first, then the slope); read it by
    # position with .iloc rather than relying on integer keys.
    alpha = float(reg.params.iloc[0])
    beta = float(reg.params.iloc[1])
    capm_rows.append({'TICKER': mystock, 'alpha': alpha, 'beta': beta})

# Build the result table once instead of growing it row by row.
coeff = pd.DataFrame(capm_rows, columns=['TICKER', 'alpha', 'beta'])
    

In [None]:
# Bar chart of the estimated alphas, largest first.
coeff = coeff.sort_values(['alpha'], ascending=False).reset_index(drop=True)
plt.figure(figsize=(16, 12))
# One colour per ticker: size the palette from the data, not a fixed count.
pal = sns.color_palette("Reds_r", len(coeff))
sns.barplot(
    x="alpha",
    y="TICKER",
    data=coeff,
    estimator=sum,
    palette=pal
);
plt.title('Alpha for CAPM')

plt.show()


In [None]:
# Bar chart of the estimated betas, largest first.
coeff = coeff.sort_values(['beta'], ascending=False).reset_index(drop=True)
plt.figure(figsize=(16, 12))
# One colour per ticker: size the palette from the data, not a fixed count.
pal = sns.color_palette("Blues_r", len(coeff))
sns.barplot(
    x="beta",
    y="TICKER",
    data=coeff,
    estimator=sum,
    palette=pal
);
plt.title('Beta for CAPM')

plt.show()


In [None]:
# Scatter plot of alpha against beta with every point labelled by its ticker.
fig, ax = plt.subplots()
ax.plot(coeff['beta'], coeff['alpha'], 'o')

# Iterate over the rows themselves, so the labels do not depend on the index
# of `coeff` being 0..n-1.
for row in coeff.itertuples(index=False):
    # Shift the label slightly to the right of its marker.
    ax.text(row.beta + 0.02, row.alpha, row.TICKER,
            horizontalalignment='left', size='medium', color='black', weight='semibold')
ax.set_xlabel('Beta')
ax.set_ylabel('Alpha')
ax.set_title('Scatter plot for CAPM result')

plt.show()


In [None]:
# Print the fit summary of the model currently held in `reg`.
# NOTE(review): `reg` is whatever the per-ticker regression loop assigned last,
# so this shows one ticker only -- presumably the final entry of
# `unique_stock`; confirm that this is the intended model.
print(reg.summary())

# Maps

In [None]:
# !pip install geopandas # install geopandas if you don't have it

In [None]:
import geopandas as gpd

zips = pd.read_csv('%s/zipcode.csv' % _DATA_DIR)
store_opening = pd.read_csv('%s/store_openings.csv' % _DATA_DIR)

# Attach zip-code rows (which carry longitude/latitude) to each store by
# city and state, then keep the first matched row per store number.
# Built as one chain, without an aliased scratch frame or in-place mutation.
walmart = (
    pd.merge(store_opening, zips,
             left_on=['STRCITY', 'STRSTATE'], right_on=['city', 'state'],
             how='left')
    .drop_duplicates('storenum')
)

# Get the geometric information
geo_walmart = gpd.GeoDataFrame(walmart, geometry=gpd.points_from_xy(walmart.longitude, walmart.latitude))
# Characters -5..-2 of OPENDATE -- presumably the four-digit opening year;
# TODO confirm against the OPENDATE format. Kept as a string: the map cells
# compare it with strings such as '1986'.
geo_walmart['Date'] = geo_walmart['OPENDATE'].str[-5:-1]


In [None]:
# Get the USA map
usa_map = gpd.read_file("%s/cb_2018_us_state_20m.shp" % _DATA_DIR)
# Drop the non-contiguous areas by name rather than by row number, so the
# filter does not depend on the row order of the shapefile.
# NOTE(review): assumes rows 7, 25 and 48 of the file were exactly these three.
usa = usa_map[~usa_map.NAME.isin(['Alaska', 'Hawaii', 'Puerto Rico'])]
ax = usa.plot()
ax.set_axis_off()
plt.show()

In [None]:
# Stores opened before 1986, drawn over the state boundaries.
# Create the axes first and hand them to every layer: a geopandas plot called
# without `ax` opens its own figure, which would ignore this figure size and
# leave an empty figure behind.
fig, ax = plt.subplots(figsize=(16, 12))
usa.boundary.plot(ax=ax, edgecolor='black')

# `Date` holds strings, so the cutoff is compared as a string.
opened_by_cutoff = geo_walmart.Date < '1986'
geo_walmart[(geo_walmart.type_store == 'Supercenter') & opened_by_cutoff].plot(ax=ax, color='palegreen', label='Supercenter')
geo_walmart[(geo_walmart.type_store == 'Wal-Mart') & opened_by_cutoff].plot(ax=ax, color='orange', label='Walmart')
ax.legend()
ax.set_axis_off()
ax.set_title('Walmart Expansion (1985)')
plt.show()

In [None]:
# Stores opened before 1996, drawn over the state boundaries.
# Create the axes first and hand them to every layer: a geopandas plot called
# without `ax` opens its own figure, which would ignore this figure size and
# leave an empty figure behind.
fig, ax = plt.subplots(figsize=(16, 12))
usa.boundary.plot(ax=ax, edgecolor='black')

# `Date` holds strings, so the cutoff is compared as a string.
opened_by_cutoff = geo_walmart.Date < '1996'
geo_walmart[(geo_walmart.type_store == 'Supercenter') & opened_by_cutoff].plot(ax=ax, color='palegreen', label='Supercenter')
geo_walmart[(geo_walmart.type_store == 'Wal-Mart') & opened_by_cutoff].plot(ax=ax, color='orange', label='Walmart')
ax.legend()
ax.set_axis_off()
ax.set_title('Walmart Expansion (1995)')
plt.show()

In [None]:
# Stores opened before 2006, drawn over the state boundaries.
# Create the axes first and hand them to every layer: a geopandas plot called
# without `ax` opens its own figure, which would ignore this figure size and
# leave an empty figure behind.
fig, ax = plt.subplots(figsize=(16, 12))
usa.boundary.plot(ax=ax, edgecolor='black')

# `Date` holds strings, so the cutoff is compared as a string.
opened_by_cutoff = geo_walmart.Date < '2006'
geo_walmart[(geo_walmart.type_store == 'Supercenter') & opened_by_cutoff].plot(ax=ax, color='palegreen', label='Supercenter')
geo_walmart[(geo_walmart.type_store == 'Wal-Mart') & opened_by_cutoff].plot(ax=ax, color='orange', label='Walmart')
ax.legend()
ax.set_axis_off()
ax.set_title('Walmart Expansion (2005)')
plt.show()

# Trees 

In [None]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree

In [None]:
# Training data: two features and the repayment outcome, complete rows only.
tree_features = ['log_inc', 'int_rate']
df = loan[tree_features + ['Paid']].dropna()
X = df[tree_features]
y = df['Paid']

# Small tree: at most five leaves, fixed seed; other hyper-parameters default.
clf = DecisionTreeClassifier(max_leaf_nodes=5, random_state=1234)
model = clf.fit(X, y)

In [None]:
# Text rendering of the fitted tree. The feature names are passed explicitly
# (same names and order as in the plot_tree cell) so that the split rules read
# `log_inc` / `int_rate` instead of generic placeholders.
text_representation = tree.export_text(clf, feature_names=['log_inc', 'int_rate'])
print(text_representation)

In [None]:
# Diagram of the fitted tree; node colours follow the predicted class.
fig, ax = plt.subplots(figsize=(25, 20))

_ = tree.plot_tree(
    clf,
    feature_names=['log_inc', 'int_rate'],
    class_names=['Charged_off', 'Fully_Paid'],
    filled=True,
    ax=ax,
)
ax.set_title('Tree model for predicting default')

# CNN

In [None]:
import random
import sys
  
# adding path to the system path
# Inserted at position 0 so this directory is searched first -- presumably it
# is where the `data_utils` module imported below lives.
# NOTE(review): hard-coded absolute path, different from the directory used in
# _DATA_DIR; confirm that it still exists.
sys.path.insert(0, '/classes/2080001_spr2022')
  
from data_utils import load_CIFAR10

plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots

# Load the raw CIFAR-10 data.
cifar10_dir = '%s/cifar-10-data'%(_DATA_DIR)

# Only the first two of the four returned values are kept -- presumably the
# training images and labels; confirm against data_utils.load_CIFAR10.
X, Y, _, _ = load_CIFAR10(cifar10_dir)

In [None]:
# Visualize some examples from the dataset: one column per class, one row per
# sampled image.
classes = ['plane','car','bird','cat','deer','dog','frog','horse','ship','truck']
num_classes = len(classes)
samples_per_class = 10
# Dedicated, seeded generator: the same images are shown on every run, and the
# global NumPy random state is left alone.
sample_rng = np.random.default_rng(0)
# The loop variable is deliberately not called `y`, which is used elsewhere in
# the notebook for label vectors.
for class_idx, cls in enumerate(classes):
    idxs = np.flatnonzero(Y == class_idx)
    idxs = sample_rng.choice(idxs, samples_per_class, replace=False)
    for i, idx in enumerate(idxs):
        # Subplots are numbered row by row: row i, column class_idx.
        plt_idx = i * num_classes + class_idx + 1
        plt.subplot(samples_per_class, num_classes, plt_idx)
        plt.imshow(X[idx].astype('uint8'))
        plt.axis('off')
        if i == 0:
            # Column header on the first row only.
            plt.title(cls)
# plt.savefig('%s/Graph/Graph_visualiza_1.pdf'%(_FIGURE_DIR))
plt.show()


## MNIST

In [None]:
from __future__ import print_function
import time
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits 
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

In [None]:
# scikit-learn's bundled digits data (8x8 images flattened to 64 features).
mnist = load_digits()
# Pixel intensities in this dataset range from 0 to 16, so divide by 16 to
# scale the features to [0, 1] (255 would be the divisor for 8-bit images).
X = mnist.data / 16.0
y = mnist.target
print(X.shape, y.shape)


In [None]:
# One column per pixel, plus the digit as a number (`y`) and as text (`label`).
feat_cols = ['pixel%d' % i for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feat_cols)
df['y'] = y
df['label'] = df['y'].map(str)
# Drop the references to the raw arrays; the frame now holds the data.
X = y = None
print('Size of the dataframe: {}'.format(df.shape))



In [None]:
# Fixed seed, then a random ordering of the row positions of `df`.
np.random.seed(42)
rndperm = np.random.permutation(len(df))

In [None]:
# First ten digit images on a 2x5 grid, each with its label printed below.
for panel, (pixels, digit) in enumerate(zip(mnist.data[:10], mnist.target[:10]), start=1):
    plt.subplot(2, 5, panel)
    plt.imshow(pixels.reshape([8, 8]), cmap=plt.cm.gray_r)
    plt.text(3, 10, str(digit))
    plt.xticks([])
    plt.yticks([])
In [None]:
# Take up to N rows in the shuffled order (the slice simply stops early when
# fewer rows are available).
N = 10000
df_subset = df.loc[rndperm[:N], :].copy()
data_subset = df_subset[feat_cols].values

# Three principal components are computed; the first two are stored for plotting.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(data_subset)
for component, column in enumerate(['pca-one', 'pca-two']):
    df_subset[column] = pca_result[:, component]



In [None]:
# Two-dimensional t-SNE embedding of the same rows that went into the PCA.
# With an explicit random_state the embedding is the same every time this cell
# runs; without one, TSNE draws from NumPy's global random state and the result
# changes whenever the cell is re-run on its own.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# check against the installed version.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(data_subset)



In [None]:
# Store the two t-SNE coordinates next to the PCA ones.
for component, column in enumerate(['tsne-2d-one', 'tsne-2d-two']):
    df_subset[column] = tsne_results[:, component]

In [None]:
# PCA (left) and t-SNE (right) embeddings side by side, coloured by digit.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
panels = (
    (ax1, "pca-one", "pca-two"),
    (ax2, "tsne-2d-one", "tsne-2d-two"),
)
for axis, x_col, y_col in panels:
    sns.scatterplot(
        data=df_subset,
        x=x_col, y=y_col,
        hue="y",
        palette=sns.color_palette("hls", 10),
        legend="full",
        alpha=0.3,
        ax=axis,
    )

# Text

In [None]:
# !pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Word weights: the `Occurance` column, keyed by the frame's index
# (the first CSV column).
Trump_word_importance = pd.read_csv('%s/Trump_word_importance.csv' % _DATA_DIR, index_col=0)
word_weights = dict(Trump_word_importance['Occurance'])

# Word cloud on a white background, limited to 200 words.
wc = WordCloud(background_color='white', max_words=200, max_font_size=100, scale=32)
wc.generate_from_frequencies(word_weights)

plt.figure(figsize=(16, 12))
plt.imshow(wc)
plt.axis('off')


In [None]:
from sklearn.manifold import TSNE

# Load topic model dataset as a plain array (first CSV column is the index).
hm = np.array(pd.read_csv('%s/Topic_model_result.csv' % _DATA_DIR, index_col=0))

# Seeded t-SNE embedding of the rows of `hm`.
tsne = TSNE(random_state=2022, perplexity=30, early_exaggeration=120)
embedding = pd.DataFrame(tsne.fit_transform(hm), columns=['x', 'y'])
# Colour each point by the column holding its largest value in `hm`.
embedding['hue'] = np.argmax(hm, axis=1)

plt.figure(figsize=(12, 12))
ax = sns.scatterplot(data=embedding, x='x', y='y', hue='hue')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_title('Topic model dimension reduction with t-SNE')

plt.show()
