In [0]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import cross_val_score, KFold
from sklearn import preprocessing as prep
from sklearn import utils
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve

In [0]:
dataset_url = '/content/drive/My Drive/coursera/Week 3/svm-data.csv'
data = pd.read_csv(dataset_url,  header=None)

In [134]:
data

Unnamed: 0,0,1,2
0,0.0,0.7,0.29
1,1.0,0.23,0.55
2,0.0,0.72,0.42
3,0.0,0.98,0.68
4,0.0,0.48,0.39
5,1.0,0.34,0.73
6,0.0,0.44,0.06
7,1.0,0.4,0.74
8,0.0,0.18,0.18
9,1.0,0.53,0.53


In [135]:
X = data.drop([0], axis = 1)
X.shape

(10, 2)

In [136]:
y = data.loc[:, 0]
y.shape

(10,)

In [137]:
clf = svm.SVC(kernel='linear', C=100000, random_state=241)
clf.fit(X, y)

SVC(C=100000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=241, shrinking=True, tol=0.001,
    verbose=False)

In [138]:
clf.support_

array([3, 4, 9], dtype=int32)

#Text analysis

In [0]:
from sklearn import datasets

newsgroups = datasets.fetch_20newsgroups(
                    subset='all', 
                    categories=['alt.atheism', 'sci.space']
             )

In [0]:
 from sklearn.feature_extraction.text import TfidfVectorizer
 vectorizer = TfidfVectorizer()

In [0]:
X = vectorizer.fit_transform(newsgroups.data)

In [0]:
y = newsgroups.target

In [143]:
# return words mapped into features
feature_mapping = vectorizer.get_feature_names()
print(len(feature_mapping))

28382


In [144]:
%%time
grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = svm.SVC(kernel='linear', random_state=241)
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(X, y)

CPU times: user 2min 52s, sys: 63.5 ms, total: 2min 52s
Wall time: 2min 52s


In [145]:
print(gs.cv_results_['mean_test_score'])

[0.55263603 0.55263603 0.55263603 0.55263603 0.95016353 0.99328044
 0.99328044 0.99328044 0.99328044 0.99328044 0.99328044]


In [146]:
gs.best_params_

{'C': 1.0}

In [147]:
clf = svm.SVC(kernel='linear', C=1.0, random_state=241)
clf.fit(X,y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=241, shrinking=True, tol=0.001,
    verbose=False)

In [148]:
for i in clf.coef_.toarray():
    print(i)
    break

[ 0.29258057 -0.12314757  0.         ...  0.01972862  0.05831336
 -0.00297347]


In [149]:
%%time
sorted_coeffs = sorted(clf.coef_.todok().items(), key = lambda x: abs(x[1]))[-10:]
words = map(feature_mapping.__getitem__, iter([x[0][1] for x in sorted_coeffs]))
print(' '.join(sorted(words)))

atheism atheists bible god keith moon religion sci sky space
CPU times: user 43 ms, sys: 1 ms, total: 44 ms
Wall time: 47.4 ms


In [150]:
#for afeature in reversed(sorted_coeffs):
#    print(afeature[0][1], feature_mapping[afeature[0][1]])#, afeature[1])
%time result_ok = np.argsort(np.abs(np.asarray(clf.coef_.todense())).reshape(-1))[-10:]

CPU times: user 5.13 ms, sys: 0 ns, total: 5.13 ms
Wall time: 5.62 ms


In [151]:
%%time
word = pd.DataFrame(data=vectorizer.get_feature_names())
coef = pd.DataFrame(data=np.abs(np.asarray(clf.coef_.todense()).reshape(-1)))
data = pd.concat([word, coef], axis=1)
data.columns = ['word', 'coef']
data.sort_values(by=['coef'])[-10:]

CPU times: user 40.4 ms, sys: 0 ns, total: 40.4 ms
Wall time: 42.6 ms


#Logistic regression

In [0]:
dataset_url = '/content/drive/My Drive/coursera/Week 3/data-logistic.csv'
data = pd.read_csv(dataset_url,  header=None)

In [153]:
data

Unnamed: 0,0,1,2
0,-1,-0.663827,-0.138526
1,1,1.994596,2.468025
2,-1,-1.247395,0.749425
3,1,2.309374,1.899836
4,1,0.849143,2.407750
...,...,...,...
200,1,4.245176,3.053931
201,1,2.437935,1.357804
202,-1,-1.876638,1.533398
203,1,-6.824446,-13.934211


In [154]:
X = data.drop([0], axis = 1)
X.shape

(205, 2)

In [155]:
y = data.loc[:, 0]
y.shape

(205,)

In [0]:
def calc_gradient_w1(X: pd.DataFrame, y: pd.Series, w1: float, w2: float, k: float, C: float) -> float:
    y_lenght = len(y)
    sum_elements = 0
    for i in range(y_lenght):
        sum_elements += y[i] * X[1][i] * (1.0 - 1.0 / (1.0 + np.exp(-y[i] * (w1 * X[1][i] + w2 * X[2][i]))))
    return w1 + k / y_lenght * sum_elements - k * C * w1

def calc_gradient_w2(X: pd.DataFrame, y: pd.Series, w1: float, w2: float, k: float, C: float) -> float:
    y_lenght = len(y)
    sum_elements = 0
    for i in range(y_lenght):
        sum_elements += y[i] * X[2][i] * (1.0 - 1.0 / (1.0 + np.exp(-y[i] * (w1 * X[1][i] + w2 * X[2][i]))))
    return w2 + k / y_lenght * sum_elements - k * C * w2

In [0]:
def gradient_descent(X: pd.DataFrame, y: pd.Series, w1: float=0.0, w2: float=0.0, k: float=0.1, C: float=0.0,
                     precision: float=1e-5, max_iterations: int=10000) -> [float, float]:
    for i in range(max_iterations):
        w1_previous, w2_previous = w1, w2
        w1, w2 = calc_gradient_w1(X, y, w1, w2, k, C), calc_gradient_w2(X, y, w1, w2, k, C)
        if np.sqrt((w1_previous-w1)**2 + (w2_previous-w2)**2)<=precision:
            break
    print(f'steps done: {i}')
    return w1, w2

In [158]:
w1, w2 = gradient_descent(X, y)
w1_reg, w2_reg = gradient_descent(X, y, C=10.0)

steps done: 243
steps done: 7


In [159]:
w1, w2

(0.2878116204717764, 0.09198330215925439)

In [160]:
w1_reg, w2_reg

(0.028558754546234223, 0.02478013724973556)

In [0]:
def gradient_probability(X: pd.DataFrame, w1, w2) -> pd.Series:
    return 1.0 / (1.0 + np.exp(-w1 * X[1] - w2 * X[2]))

In [0]:
y_pred = gradient_probability(X, w1, w2)
y_pred_l2 = gradient_probability(X, w1_reg, w2_reg)

In [163]:
y_pred - y_pred_l2

0     -0.045177
1      0.160710
2     -0.067753
3      0.170115
4      0.093438
         ...   
200    0.268910
201    0.169833
202   -0.094565
203   -0.330668
204    0.014642
Length: 205, dtype: float64

In [164]:
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y, y_pred)
auc_l2 = roc_auc_score(y, y_pred_l2)

print(f"{auc:.3f} {auc_l2:.3f}")

0.927 0.936


##Metrics

In [184]:
dataset_url = '/content/drive/My Drive/coursera/Week 3/week_3_classification.csv'
data = pd.read_csv(dataset_url,)
data

Unnamed: 0,true,pred
0,1,0
1,1,1
2,1,1
3,0,0
4,1,1
...,...,...
195,0,0
196,0,0
197,1,0
198,0,1


In [0]:
y = data['true']
y_pred = data['pred']

In [210]:
y.shape, y_pred.shape

((200,), (200,))

In [211]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_pred)

array([[64, 34],
       [59, 43]])

In [177]:
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
print(tn, fp, fn, tp)

64 34 59 43


In [0]:
true_predictions = y[y==y_pred]
false_predictions = y[y!=y_pred]
# TP, FP, FN и TN
true_negatives = len(true_predictions) - sum(true_predictions)
true_positives = sum(true_predictions)

In [192]:
true_negatives, true_positives

(64, 43)

In [0]:
false_positives = len(false_predictions) - sum(false_predictions)
false_negatives = sum(false_predictions)

In [203]:
false_negatives, false_positives

(59, 34)

In [204]:
print(true_positives, false_positives, false_negatives, true_negatives)

43 34 59 64


In [212]:
accuracy_score(y, y_pred)

0.535

In [216]:
print(round(accuracy_score(y, y_pred), 2),
    round(precision_score(y, y_pred), 2),
    round(recall_score(y, y_pred),2),
    round(f1_score(y, y_pred), 2), )

0.54 0.56 0.42 0.48


##Metrics
####part 2

In [217]:
dataset_url = '/content/drive/My Drive/coursera/Week 3/week_3_scores.csv'
data = pd.read_csv(dataset_url,)
data

Unnamed: 0,true,score_logreg,score_svm,score_knn,score_tree
0,0,0.683832,0.145976,0.787063,0.500000
1,1,0.801966,0.239511,1.000000,0.833333
2,0,0.382315,-0.245701,0.000000,0.000000
3,1,0.506797,-0.137058,0.000000,0.105263
4,1,0.488781,-0.154148,0.000000,0.105263
...,...,...,...,...,...
195,0,0.573801,-0.088203,0.284192,0.400000
196,0,0.624422,-0.012315,0.205437,0.400000
197,1,0.425538,-0.135673,0.382351,0.700000
198,0,0.905270,0.583806,1.000000,1.000000


In [0]:
log_reg_score = data['score_logreg']
y = data['true']
svm_score = data['score_svm']
knn_score = data['score_knn']
tree_score = data['score_tree']


In [219]:
print(roc_auc_score(y, log_reg_score),
      roc_auc_score(y, svm_score),
      roc_auc_score(y, knn_score),
      roc_auc_score(y, tree_score),)    

0.719187675070028 0.7086834733893557 0.6351540616246498 0.6919267707082833


In [0]:
precision_recall_curve_columns = ["precision", "recall", "thresholds"]
log_reg_curve = pd.DataFrame(precision_recall_curve(y, log_reg_score)).T
log_reg_curve.columns = precision_recall_curve_columns
svm_curve = pd.DataFrame(precision_recall_curve(y, svm_score)).T
svm_curve.columns = precision_recall_curve_columns
knn_curve = pd.DataFrame(precision_recall_curve(y, knn_score)).T
knn_curve.columns = precision_recall_curve_columns
tree_curve = pd.DataFrame(precision_recall_curve(y, tree_score)).T
tree_curve.columns = precision_recall_curve_columns

In [239]:
print(log_reg_curve[log_reg_curve["recall"] > 0.7]["precision"].max(),
svm_curve[svm_curve["recall"] > 0.7]["precision"].max(),
knn_curve[knn_curve["recall"] > 0.7]["precision"].max(),
tree_curve[tree_curve["recall"] > 0.7]["precision"].max(),)

0.6302521008403361 0.6228070175438597 0.6065573770491803 0.6517857142857143
