In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans

In [2]:
with open('samsung_train.txt') as file_1:
    list_1 = [line.rstrip() for line in file_1]

with open('samsung_test.txt') as file_2:
    list_2 = [line.rstrip() for line in file_2]

merged_data = list_1 + list_2
df_data = pd.DataFrame({'data': merged_data})

In [3]:
with open('samsung_train_labels.txt') as file_3:
    labels_1 = [line.rstrip() for line in file_3]

with open('samsung_test_labels.txt') as file_4:
    labels_2 = [line.rstrip() for line in file_4]

merged_labels = labels_1 + labels_2

df_labels = pd.DataFrame({'labels': merged_labels})

In [4]:
dataset = pd.concat([df_data, df_labels], axis=1)
dataset['data'] = dataset['data'].apply(lambda x: [float(value) for value in x.split()])

In [5]:
X, y = dataset['data'], dataset['labels']
X = np.vstack(X.values)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
clf = SGDClassifier()

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print("Pojedynczy wynik: ", score)

Pojedynczy wynik:  0.9754045307443365


In [8]:
cv_score = cross_val_score(clf, X, y, cv=5)
print("wynik kroswalidacji: ", cv_score)
print("średni wynik wszystkich foldów: ", cv_score.mean())

wynik kroswalidacji:  [0.95097087 0.91990291 0.96893204 0.96067961 0.97183099]
średni wynik wszystkich foldów:  0.9544632845617393


In [9]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

models = []
scores = []

for fold_nr, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    X_train = X[train_idx]
    X_test =X[test_idx]

    y_train = y[train_idx]
    y_test = y[test_idx]

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
  
    clf = SGDClassifier(random_state=1).fit(X_train, y_train)

    models.append(clf)
    scores.append(clf.score(X_test, y_test))


print("wyniki poszczególnych foldów: ", scores)
print("średni wynik wszystkich foldów: ", np.array(scores).mean())

wyniki poszczególnych foldów:  [0.9791262135922331, 0.9781553398058253, 0.9713592233009709, 0.9786407766990292, 0.9771733851384167]
średni wynik wszystkich foldów:  0.976890987707295
