In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


There are 2 csv files in the current version of the dataset:


In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


The next hidden code cells define functions for plotting data. Click on the "Code" button in the published kernel to reveal the hidden code.

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot the value distribution of each low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 unique values are
    considered. A column whose first value is numeric is drawn as a histogram;
    any other column is drawn as a bar chart of its value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted.
    nGraphShown : int
        Maximum number of columns to plot.
    nGraphPerRow : int
        Number of subplots per grid row.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; when there is nothing to
        plot a message is printed instead.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 1 and 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    nGraph = min(df.shape[1], nGraphShown)
    if nGraph <= 0:
        # A zero-row subplot grid (and zero-height figure) cannot be drawn.
        print('No distribution plots shown: there are no eligible columns to plot')
        return
    columnNames = list(df)
    # Integer ceiling division: plt.subplot needs an integer row count, and the
    # grid is sized from the number of graphs actually drawn.
    nGraphRow = (nGraph + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(nGraph):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if not np.issubdtype(type(columnDf.iloc[0]), np.number):
            # Non-numeric column: bar chart of category frequencies.
            columnDf.value_counts().plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()


In [None]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Show the correlation matrix of the numeric, NaN-free, non-constant columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. If it carries a ``dataframeName`` attribute, that value
        is used in the plot title.
    graphWidth : float
        Width and height of the square figure, in inches.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; if fewer than two usable
        columns remain a message is printed instead.
    """
    # dataframeName is an ad-hoc attribute set by the loading cells; fall back
    # to a generic label instead of raising when it is absent.
    filename = getattr(df, 'dataframeName', 'DataFrame')
    df = df.dropna(axis='columns') # drop columns with NaN (axis must be passed by keyword on pandas 2)
    # Correlation is only defined for numeric data; pandas 2 raises on other dtypes.
    df = df.select_dtypes(include=[np.number, 'bool'])
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    fig = plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    # Draw into the figure created above; a hard-coded fignum=1 may refer to a different figure.
    corrMat = plt.matshow(corr, fignum=fig.number)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


In [None]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Show a scatter matrix with KDE diagonals for up to ten numeric columns of ``df``.

    Columns that are non-numeric, contain NaN, or hold a single unique value
    are dropped first. Each upper-triangle panel is annotated with the
    correlation coefficient of its pair of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    plotSize : float
        Width and height of the square figure, in inches.
    textSize : float
        Font size of the correlation annotations.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; if no usable column remains a
        message is printed instead.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns') # axis must be passed by keyword on pandas 2
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] == 0:
        print('No scatter plots shown: there are no numeric, non-NaN, non-constant columns')
        return
    # reduce the number of columns for matrix inversion of kernel density plots
    df = df[list(df)[:10]]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly rather than the incidental `plt.np` attribute.
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (-0.5, 0.5), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.xticks(rotation=90)
    plt.show()


### Let's check 1st file: /kaggle/input/Control.csv

In [None]:
# Load a preview of the control data (comma-separated file).
nRowsRead = 1000 # specify 'None' if want to read whole file
# Control.csv may have more rows in reality, but we are only loading/previewing the first 1000 rows
df1 = pd.read_csv('/kaggle/input/Control.csv', delimiter=',', nrows = nRowsRead)
# Ad-hoc attribute that plotCorrelationMatrix reads for its plot title.
df1.dataframeName = 'Control.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Preview the first five rows of the control data.
df1.head(5)

Distribution graphs (histogram/bar graph) of sampled columns:

In [None]:
# plotPerColumnDistribution(df1, 10, 5)

Correlation matrix:

In [None]:
# plotCorrelationMatrix(df1, 24)

Scatter and density plots:

In [None]:
# plotScatterMatrix(df1, 20, 10)

### Let's check 2nd file: /kaggle/input/Quality.csv

In [None]:
# Load a preview of the quality data (tab-separated file).
nRowsRead = 1000 # specify 'None' if want to read whole file
# Quality.csv may have more rows in reality, but we are only loading/previewing the first 1000 rows
df2 = pd.read_csv('/kaggle/input/Quality.csv', delimiter='\t', nrows = nRowsRead)
# Ad-hoc attribute that plotCorrelationMatrix reads for its plot title.
df2.dataframeName = 'Quality.csv'
nRow, nCol = df2.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# df1 = df1.drop('Unnamed: 0', axis=1)


In [None]:
# Median of each of the five Stippe_-3000* columns in the quality preview.
df2[['Stippe_-3000','Stippe_-3000.1','Stippe_-3000.2','Stippe_-3000.3','Stippe_-3000.4']].median()

In [None]:
# Give the two frames descriptive names. Plain assignment: no copy is made here,
# each new name refers to the same object as df2 / df1.
df_quality = df2
df_control = df1

In [None]:
# Compare the shapes of the quality and control frames.
df_quality.shape , df_control.shape


In [None]:
# NOTE(review): this head() is not the last expression of the cell, so its
# result is not displayed.
df_control.head()
# Drop the 'Unnamed: 0' column. Re-running this cell raises because the column
# is already gone.
df_control = df_control.drop(['Unnamed: 0'], axis=1)

In [None]:
# Replace every missing value with 0 in both frames. fillna returns new frames,
# so df1 / df2 keep their NaNs.
df_quality = df_quality.fillna(0)
df_control = df_control.fillna(0)
# df_control = df_control.drop('date', axis=1)

In [None]:
# Shapes after filling missing values.
df_quality.shape , df_control.shape

In [None]:
# Drop the 'Unnamed: 0' column from the quality frame.
df_quality = df_quality.drop(['Unnamed: 0'], axis=1)

In [None]:
# Remove the 'date' column from the control features.
df_control = df_control.drop('date', axis=1)

In [None]:
# Preview the quality frame.
df_quality.head()

In [None]:
# Use 'date' as the row index of the quality frame. df_control keeps its
# original index, so the two frames are no longer index-aligned.
df_quality = df_quality.set_index('date')

In [None]:
# Name of the quality column used as the target throughout the notebook.
stp_str = 'Stippe_-3000'

In [None]:
# Threshold on the target column; the expression below displays the target
# values of the rows that exceed it.
treshold = 47.5
df_quality[df_quality[stp_str] > treshold][stp_str]

In [None]:
# df_quality[stp_str] > 55

# Per-row colour for the scatter plots: 'r' where the target exceeds the
# threshold, 'black' otherwise.
color = np.where(df_quality[stp_str] > treshold ,'r','black')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardise both frames. The same scaler object is re-fitted by the second
# call, so after this cell `sc` holds the statistics of df_quality only.
sc = StandardScaler()
arr_control = sc.fit_transform(df_control)
# Every quality column is scaled, including the target column.
arr_quality = sc.fit_transform(df_quality)

In [None]:
# Wrap the scaled arrays back into DataFrames, keeping the original column
# labels and row index. From here on df_control / df_quality hold scaled values.
df_control = pd.DataFrame(arr_control, columns=df_control.columns, index= df_control.index)
df_quality = pd.DataFrame(arr_quality, columns=df_quality.columns, index= df_quality.index)

In [None]:
# Shapes after scaling.
df_quality.shape , df_control.shape

In [None]:
# df_quality.head(), df_control.head()

Предварительный обзор: UMAP-проекция всех признаков df_control (просто посмотреть на структуру данных)

In [None]:
import umap
import seaborn as sns
%matplotlib inline
# Seaborn style and default figure size for the plots below.
sns.set(context='notebook', style='white', rc={'figure.figsize':(9,10)})
# UMAP projection of the scaled control features (fixed random_state).
fit = umap.UMAP(n_neighbors=40, min_dist=0.05, random_state=42)
%time embedding = fit.fit_transform(df_control)

In [None]:
# Scatter of the first two embedding coordinates, coloured by the threshold flag in `color`.
plt.scatter(embedding[:, 0], embedding[:, 1], c=color)
plt.title('df_control сжатые UMAP', fontsize=24);
plt.show()


Конец предварительного обзора

In [None]:
# Target series. df_quality was standardised above, so these are scaled
# values rather than the raw measurements.
df_stippe = df_quality[stp_str]

In [None]:
# Shape of the target.
df_stippe.shape

In [None]:
# Preview the target.
df_stippe.head()

In [None]:
from sklearn import linear_model,metrics
# from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [None]:
# Fit a Lasso regression of the target on all control features.
lasso = linear_model.Lasso(alpha=0.1, tol=0.2)
lasso.fit(df_control,df_stippe)
# predictions_lasso = lasso.predict(test_data)

In [None]:
# lasso.coef_

In [None]:
# Fit a Ridge regression of the target on all control features.
ridge = linear_model.Ridge(alpha=0.1)
ridge.fit(df_control,df_stippe)

In [None]:
# One row per control feature with its fitted coefficient.
# Building each frame from a dict keeps 'coef' as a float column; the previous
# list-of-rows + transpose form produced object-dtype columns.
# np.ravel tolerates a coef_ of shape (1, n_features).
top_param_lasso = pd.DataFrame({'param': df_control.columns, 'coef': np.ravel(lasso.coef_)})

top_param_ridge = pd.DataFrame({'param': df_control.columns, 'coef': np.ravel(ridge.coef_)})

In [None]:
# Features with a positive Lasso coefficient.
top_param_lasso[top_param_lasso['coef']>0]

In [None]:
# Smallest and largest Lasso coefficient.
top_param_lasso['coef'].min(), top_param_lasso['coef'].max()

In [None]:
# Display the Ridge coefficient table.
top_param_ridge

In [None]:
# Smallest and largest Ridge coefficient.
top_param_ridge['coef'].min(), top_param_ridge['coef'].max()

In [None]:
# Keep only the features whose Lasso coefficient is strictly positive and take
# the matching columns of the scaled control frame.
# NOTE(review): features with negative coefficients are discarded as well as
# zero ones — confirm this is intended rather than `coef != 0`.
top_param_lasso =  top_param_lasso[top_param_lasso['coef'] > 0 ]
# top_param_lasso.head()

df_select_lasso = df_control[top_param_lasso['param']]


# Same selection rule for the Ridge coefficients.
top_param_ridge =  top_param_ridge[top_param_ridge['coef'] > 0 ]
# top_param_ridge.head()

df_select_ridge = df_control[top_param_ridge['param']]

In [None]:
# Preview the Lasso-selected features.
df_select_lasso.head()

In [None]:
# Column names kept by each selection.
df_select_lasso.columns, df_select_ridge.columns

In [None]:
# Turn the target Series into a one-column DataFrame. Later cells that use
# df_stippe therefore see a DataFrame, not a Series.
df_stippe = pd.DataFrame(df_stippe)

In [None]:
# Preview the target frame.
df_stippe.head()
# color

In [None]:
import umap
import seaborn as sns
%matplotlib inline
sns.set(context='notebook', style='white', rc={'figure.figsize':(9,10)})

In [None]:
# NOTE(review): the UMAP cells below pass literal n_neighbors values, so this
# variable does not appear to be read anywhere in the visible notebook.
n_neighbors=15

In [None]:
# UMAP projection of the Lasso-selected features (fixed random_state).
fit = umap.UMAP(n_neighbors=50, min_dist = 0.99, random_state=42)
%time embedding_lasso = fit.fit_transform(df_select_lasso)

In [None]:
# Scatter of the Lasso embedding, coloured by the threshold flag in `color`.
plt.scatter(embedding_lasso[:, 0], embedding_lasso[:, 1], c=color)
plt.title('Данные сжатые UMAP lasso', fontsize=24);
plt.show()

In [None]:
# UMAP projection of the Ridge-selected features (fixed random_state).
fit = umap.UMAP(n_neighbors=15, min_dist = 0.99, random_state=42)
%time embedding_ridge  = fit.fit_transform(df_select_ridge)

In [None]:
# Scatter of the Ridge embedding, coloured by the threshold flag in `color`.
plt.scatter(embedding_ridge[:, 0], embedding_ridge[:, 1], c=color)
plt.title('Данные сжатые UMAP ridge', fontsize=24);
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Display the scaled quality frame.
df_quality

In [None]:
# Boolean flag per row of the unscaled quality preview: target above treshold - 2.5.
df2[stp_str] > treshold-2.5

In [None]:
# KNN inputs: the UMAP embedding of the control features as X, and a boolean
# label marking rows of the unscaled df2 whose target exceeds treshold - 2.5.
X = embedding
y = df2[stp_str] > (treshold-2.5)
# Hold out 33% of the rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
from matplotlib.legend_handler import HandlerLine2D


def _knn_auc(model, features, labels):
    """Return the ROC AUC of a fitted classifier on (features, labels).

    The curve is built from the predicted probability of the positive class.
    Hard 0/1 predictions give a ROC curve with a single operating point, which
    under-reports how well the model ranks the samples.
    """
    scores = model.predict_proba(features)[:, 1]
    fpr, tpr, _ = roc_curve(labels, scores)
    return auc(fpr, tpr)


# Sweep the number of neighbours and record train / test AUC for each value.
neighbors = list(range(1, 30))
train_results = []
test_results = []
for n in neighbors:
    model = KNeighborsClassifier(n_neighbors=n)
    model.fit(X_train, y_train)
    train_results.append(_knn_auc(model, X_train, y_train))
    test_results.append(_knn_auc(model, X_test, y_test))

# Train (blue) versus test (red) AUC as a function of n_neighbors.
line1, = plt.plot(neighbors, train_results, 'b', label="Train AUC")
line2, = plt.plot(neighbors, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('n_neighbors')
plt.show()

In [None]:
# Fit the KNN classifier with 21 neighbours on the training split.
neigh = KNeighborsClassifier(n_neighbors=21, n_jobs=-1)
neigh.fit(X_train, y_train)

In [None]:
# Predict for every row of X, which includes the rows the model was trained on.
pred = neigh.predict(X)

In [None]:
# Pair the true label ('black') with the KNN prediction ('claster'), one row
# per sample. Built from a dict so each column keeps its own dtype, and
# without the chained assignment that also created a stray global `columns`.
res = pd.DataFrame({'black': np.asarray(y), 'claster': np.asarray(pred)})

In [None]:
# Rows whose true label is positive.
res[res['black'] == 1]

In [None]:
# Colour each point by the KNN prediction: red where predicted positive,
# black otherwise. `cmap` is dropped because matplotlib ignores it (with a
# warning) when `c` already holds colour names.
color = np.where(res['claster'], 'red', 'black')
plt.scatter(embedding[:, 0], embedding[:, 1], c=color)
plt.title('Данные сжатые UMAP KNN', fontsize=24)
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [None]:
# Switch X to the scaled control features and y to the plain threshold label.
X = df_control
y = df2[stp_str] > treshold
# NOTE(review): the KMeans cell that follows fits on all of X, so this split
# is not used by it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Two-cluster KMeans fitted on all rows of X; `pred` holds the cluster id of each row.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
pred = kmeans.predict(X)

In [None]:
# Pair the true label ('black') with the KMeans cluster id ('claster'), one
# row per sample. Built from a dict so each column keeps its own dtype, and
# without the chained assignment that also created a stray global `columns`.
res = pd.DataFrame({'black': np.asarray(y), 'claster': np.asarray(pred)})

In [None]:
# Rows whose true label is positive.
res[res['black'] == 1]

In [None]:
# Colour each point by its KMeans cluster: red for cluster 0, black otherwise.
# `cmap` is dropped because matplotlib ignores it (with a warning) when `c`
# already holds colour names.
color = np.where(res['claster'] == 0, 'r', 'black')
plt.scatter(embedding[:, 0], embedding[:, 1], c=color)
plt.title('Данные сжатые UMAP KMeans', fontsize=24)
plt.show()

In [None]:
# Cluster in the UMAP space of the Lasso-selected features; y is the plain threshold label.
X = embedding_lasso
y = df2[stp_str] > treshold

In [None]:

# Hold out 33% of the rows (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Two-cluster KMeans fitted on the training split only, then used to assign a cluster id to every row of X.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_train)
pred = kmeans.predict(X)

In [None]:
# Pair the true label ('black') with the KMeans cluster id ('claster'), one
# row per sample. Built from a dict so each column keeps its own dtype, and
# without the chained assignment that also created a stray global `columns`.
res = pd.DataFrame({'black': np.asarray(y), 'claster': np.asarray(pred)})

In [None]:
# Rows assigned to cluster 1.
res[res['claster'] == 1]

In [None]:
# Colour each point by its KMeans cluster: red for cluster 1, black otherwise.
# `cmap` is dropped because matplotlib ignores it (with a warning) when `c`
# already holds colour names.
color = np.where(res['claster'] == 1, 'r', 'black')
plt.scatter(embedding_lasso[:, 0], embedding_lasso[:, 1], c=color)
plt.title('Данные сжатые UMAP lasso KMeans', fontsize=24)
plt.show()

KMeans на UMAP-проекции признаков, отобранных Ridge (embedding_ridge)

In [None]:
# Cluster in the UMAP space of the Ridge-selected features; y is the plain threshold label.
X = embedding_ridge
y = df2[stp_str] > treshold

In [None]:
# Check that features and labels have the same number of rows.
len(X), len(y)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Two-cluster KMeans fitted on the training split only.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_train)

In [None]:
# Cluster id for every row of X (training rows included).
pred = kmeans.predict(X)

In [None]:
# Pair the true label ('black') with the KMeans cluster id ('claster'), one
# row per sample. Built from a dict so each column keeps its own dtype, and
# without the chained assignment that also created a stray global `columns`.
res = pd.DataFrame({'black': np.asarray(y), 'claster': np.asarray(pred)})

In [None]:
# Rows whose true label is positive.
res[res['black'] == 1]

In [None]:
# Colour each point by its KMeans cluster: red for cluster 1, black otherwise.
# `cmap` is dropped because matplotlib ignores it (with a warning) when `c`
# already holds colour names.
color = np.where(res['claster'] == 1, 'r', 'black')
plt.scatter(embedding_ridge[:, 0], embedding_ridge[:, 1], c=color)
plt.title('Данные сжатые UMAP ridge KMeans', fontsize=24)
plt.show()

Рассмотрим большой кластер (розовый).
Для начала нам необходимо развернуть данные.

In [None]:
# Display the Ridge UMAP embedding array.
embedding_ridge

In [None]:
# FIXME: this cell originally held the unfinished statement `embedding = pd.`,
# which is a SyntaxError and would also have overwritten the UMAP `embedding`
# array used by earlier plots. It stays disabled until the intended
# transformation is written.
# embedding = pd.

In [None]:
# Rows of the raw (unscaled) control preview whose entry in `pred` equals 1.
control_claster = df1[pred == 1]

In [None]:
# Drop the 'Unnamed: 0' column from the cluster subset.
control_claster = control_claster.drop('Unnamed: 0', axis=1)

In [None]:
# Preview the cluster subset.
control_claster.head()

In [None]:
# plotScatterMatrix(control_claster, 20, 10)

In [None]:
# Target rows that belong to the selected cluster. `predictions` is only
# assigned in commented-out cells, so the original expression raised NameError;
# `pred == 1` is the mask used to build control_claster.
df_stippe[pred == 1]

In [None]:
# Attach the target values of the selected cluster as a new column.
# - `pred == 1` replaces the undefined name `predictions` and is the mask
#   used to build control_claster, so the lengths match.
# - The values are attached by position (.to_numpy()): df_stippe is indexed by
#   date while control_claster keeps the original row index, so index-aligned
#   assignment would yield only NaN.
# - assign() returns a new frame instead of writing into a slice of df1.
control_claster = control_claster.assign(
    **{stp_str: df_stippe.loc[pred == 1, stp_str].to_numpy()}
)

In [None]:
# Scatter/density matrix of the cluster subset (plotScatterMatrix keeps at most ten numeric columns).
plotScatterMatrix(control_claster, 20, 10)

In [None]:
# Pairwise plots of the cluster subset. The former hue="species" argument
# named a column that this dataset does not have and made the call fail; the
# palette argument only had meaning together with it.
sns.pairplot(control_claster)

In [None]:
def plot_(clf_name, predictions):
    """Scatter the Ridge UMAP embedding, coloured by ``predictions``.

    Points are red where ``predictions`` is truthy and black elsewhere;
    ``clf_name`` is appended to the plot title.
    """
    point_colors = np.where(predictions, 'r', 'black')
    xs = embedding_ridge[:, 0]
    ys = embedding_ridge[:, 1]
    title_text = 'Данные сжатые UMAP ridge ' + clf_name
    plt.scatter(xs, ys, c=point_colors)
    plt.title(title_text, fontsize=24)
    plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# rf = RandomForestClassifier(random_state = 42)
# n_estimators = [int(x) for x in np.linspace(start = 20, stop = 50, num = 5)]
# params = {'bootstrap': [True, False],
#  'max_depth': [10, 20, 30, 40, None],
#  'max_features': ['auto', 'sqrt'],
#  'min_samples_leaf': [1, 2, 4, 5],
#  'min_samples_split': [2, 5, 10, 15],
#  'n_estimators': n_estimators}

In [None]:
# clf = GridSearchCV(rf, params, cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# %time clf.fit(X_train, y_train)

In [None]:
# best_clf = clf.best_estimator_

In [None]:
# predictions = clf.best_estimator_.predict(X_test)
# scores = cross_val_score(best_clf, X, y, cv=5)
# scores.mean()

In [None]:
# predictions = best_clf.predict(X)

In [None]:
# plot_('randomforest', predictions)

In [None]:
from sklearn.metrics import confusion_matrix
# confusion_matrix(y, predictions), 

In [None]:
# y = y.astype(numpy.float32)
# predictions = predictions.astype(numpy.float32)

In [None]:
from sklearn.metrics import accuracy_score, r2_score
# accuracy_score(y, predictions), r2_score(y, np.array(predictions))

In [None]:
# def svc_estimator_selection(X, y, nfolds):
#     Cs = [0.001, 0.01, 0.1, 1, 10]
#     gammas = [0.001, 0.01, 0.1, 1]
#     param_grid = {'C': Cs, 'gamma' : gammas}
#     grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds, n_jobs = -1)
#     grid_search.fit(X, y)
#     return grid_search.best_estimator_

In [None]:
# best_svc = svc_estimator_selection(X_train, y_train, 5)

In [None]:
# predictions = best_svc.predict(X_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
#               "base_estimator__splitter" :   ["best", "random"],
#               "n_estimators": [1, 2,3,4,5,6]
#              }


# DTC = DecisionTreeClassifier(random_state = 11, max_features = "auto")

# ABC = AdaBoostClassifier(base_estimator = DTC)

# # run grid search
# grid_search_ABC = GridSearchCV(ABC, param_grid=param_grid, scoring = 'roc_auc', n_jobs = -1, verbose = 2)

In [None]:
# grid_search_ABC.fit(X_train, y_train)

In [None]:
# ada_best = grid_search_ABC.best_estimator_

In [None]:
# predicts = ada_best.predict(X)

In [None]:
# accuracy_score(np.array(y), predictions)

In [None]:
# predictions/

In [None]:
# plot_('ADA', predicts)

In [None]:
# clf.best_params_

In [None]:
# best_params = {'bootstrap': True,
#  'max_depth': 10,
#  'max_features': 'auto',
#  'min_samples_leaf': 1,
#  'min_samples_split': 10,
#  'n_estimators': 28}

In [None]:
# from sklearn.manifold import TSNE

In [None]:
# %time tsne_embedding = TSNE(n_components=2, random_state=42).fit_transform(df_select_ridge)


In [None]:
# plt.scatter(tsne_embedding[:, 0], tsne_embedding[:, 1], s= 20, c=color)
# plt.title('Данные сжатые TSNE', fontsize=24);