In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import scipy.stats as st

# Apply seaborn's "darkgrid" theme once, up front, before any plotting cell runs.
sns.set_theme(style="darkgrid")
# Seed BOTH generators. The stdlib `random` module and NumPy's legacy global
# RandomState keep independent state, so seeding only `random` leaves every
# np.random.* call in this notebook non-reproducible across kernel restarts.
random.seed(10)
np.random.seed(10)

#### Data generation

In [None]:
#### RANDOM VARIABLES ######
random.random()  # float drawn from [0, 1)
random.randint(0, 100)  # int from 0 to 100, both endpoints included
np.random.randint(low=0, high=100, size=5)  # 5 ints from 0 up to 99 (high is excluded)
np.random.sample()  # float drawn from [0, 1)

#### LISTS ####
[i * 0.111 for i in range(100)]  # 0, 1, ..., 99, each multiplied by .111
np.arange(3)  # array([0, 1, 2])

#### DISTRIBUTIONS ####

# Normal: mean, standard deviation, number of draws
mu = 0
sigma = 1
n = 1000
x = np.random.normal(loc=mu, scale=sigma, size=n)

# Log-normal distribution (parameters describe the underlying normal)
np.random.lognormal(mean=1., sigma=.4, size=10000)

# Binomial: number of trials and success probability per trial.
# Note that `n` is rebound here (it was the normal sample size above).
n = 10
p = .5
s = np.random.binomial(n=n, p=p, size=1000)

# Uniform: lower bound, upper bound, number of points
l, h, t = -1, 0, 1000
np.random.uniform(low=l, high=h, size=t)

# Poisson with rate 5, 10000 draws
np.random.poisson(lam=5, size=10000)


#### Math functions

In [None]:
# `df`, `series`, `series1` and `series2` are placeholders for your own
# DataFrame / Series objects; they are not defined in this cell.
df.var(), series.var()  # variance (column-wise for a DataFrame)
series.mean(), series.median(), series.std()  # mean, median, standard deviation
series1.cov(series2)  # covariance between two series
series.rank()  # rank of each value (1 = smallest; ties get their average rank by default)


#### DataFrame handling

In [None]:
# Build a DataFrame from two arrays, one column per array
pd.DataFrame({'w_a': w_a, 'w_b': w_b})
# Draw n rows of df at random
df.sample(n=n)

# Store column index 1 of y_prob as a new "pred_prob" column
# (presumably the positive-class column of a predict_proba result; confirm)
wine_df_original["pred_prob"] = y_prob[:, 1]


#### Plotting

In [None]:
# Adjust padding so labels/titles fit inside the figure
# (normally called once the plots have been drawn)
plt.tight_layout()

# --- scatter plot ---
# from a DataFrame: only the rows whose `model` column equals 'a'
sns.scatterplot(x='x', y='y', data=df.loc[df.model == 'a'], alpha=0.3)
# from plain arrays (no DataFrame), drawn onto ax[0]
sns.scatterplot(x=x, y=y, ax=ax[0], alpha=0.3)

# --- scatter plot with a fitted regression line ---
sns.regplot(x='x', y='y', data=df.loc[df.label == 1], scatter_kws=dict(alpha=0.1))

# --- estimated probability density (KDE) ---
sns.kdeplot(data=df.x)

# --- histogram ---
sns.histplot(x="flipper_length_mm", data=penguins)

# --- line plot with axis labels ---
sns.lineplot(x=fpr, y=tpr).set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
)


#### Regressions / Classification

In [None]:
from sklearn import linear_model, svm
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV, train_test_split

# NOTE: X, y and X0 are placeholders for your own feature matrix, target and
# clustering data; they are not defined in this cell.

# Split data into training and test sets.
# Done first because the Ridge example below needs X_train / X_test to exist.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

#### Lasso
lasso = linear_model.Lasso(alpha=0.1)
lasso.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
print(lasso.coef_, lasso.intercept_)

#### Ridge
# The `normalize=` argument was removed from Ridge in scikit-learn 1.2.
# If the features need scaling, standardise them before fitting
# (e.g. with sklearn.preprocessing.StandardScaler).
ridge = linear_model.Ridge(alpha=10**10)
ridge.fit(X_train, y_train)
pred = ridge.predict(X_test)

### KMeans Clustering
# algorithm="full" was renamed to "lloyd" in scikit-learn 1.1 and the old
# spelling was later removed; "lloyd" selects the same classic algorithm.
res0 = KMeans(n_clusters=2, algorithm="lloyd", max_iter=30).fit(X0)

# Grid Search (hyperparameter tuning)
# Every combination of kernel and C is cross-validated on an SVC estimator.
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X, y)
clf.cv_results_
