<a href="https://colab.research.google.com/github/simnpeter/2022.Oop/blob/master/Low_level_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import pickle
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset path used
# later in this notebook lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the dataset on the mounted Google Drive. Each line is parsed by
# custom_parser as "<username>;<integer label>".
path='/content/drive/My Drive/Colab Notebooks/dataset.csv'

def custom_parser(line):
    """Split one dataset line into a ``(username, label)`` pair.

    The label is whatever follows the last semicolon; everything before that
    semicolon (including any further semicolons) is the username. A line
    without any semicolon is used as both the username and the label.

    Raises:
        ValueError: if the label part cannot be converted with ``int``.
    """
    head, separator, tail = line.rpartition(";")
    # rpartition leaves `head` empty when no ';' exists, so fall back to the
    # whole line (held in `tail`) in that case.
    username = head if separator else tail
    label = int(tail)
    return username, label

# 1. Parse the raw file into (username, label) records.
data = []
with open(path, "r", encoding="utf-8") as file:
    for raw_line in file:
        stripped = raw_line.strip()
        if not stripped:
            # A blank line holds no record and would make int('') fail in
            # custom_parser, so skip it instead of aborting the whole load.
            continue
        username, label = custom_parser(stripped)
        data.append((username, label))

# Put the parsed records into a pandas DataFrame.
df = pd.DataFrame(data, columns=["Username", "Label"])

# Build a class-balanced copy by down-sampling every label to the size of the
# rarest one. The fixed seed makes the sample (and everything trained on it)
# repeatable, matching the random_state already used for the splits below.
label_counts = df['Label'].value_counts()
grouped = df.groupby('Label')
balanced_df = grouped.sample(n=int(label_counts.min()), random_state=42)
balanced_df = balanced_df.reset_index(drop=True)

# 2. Split into features (X) and targets (y)
X = df['Username']
y = df['Label']

X_balanced = balanced_df['Username']
y_balanced = balanced_df['Label']

# Each dataset gets its own vectorizer: re-fitting a single instance would
# overwrite the vocabulary learned from the full data, leaving no fitted
# vectorizer whose columns match `X`.
# NOTE(review): the default word-level analyzer probably treats most usernames
# as one token; character n-grams (analyzer='char_wb') may work better - confirm.
vectorizer_full = CountVectorizer()
X = vectorizer_full.fit_transform(X)

vectorizer_balanced = CountVectorizer()
X_balanced = vectorizer_balanced.fit_transform(X_balanced)

# Kept for backward compatibility: previously `vectorizer` ended up fitted on
# the balanced data after this cell.
vectorizer = vectorizer_balanced

# 3. Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_balanced_train, X_balanced_test, y_balanced_train, y_balanced_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)



In [None]:
# Base estimator for the search. The fixed seed makes weight initialisation
# and batch sampling repeatable between runs.
mlp_classifier = MLPClassifier(random_state=42)

# Define the grid of hyperparameters to try out
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'activation': ['relu', 'tanh'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
}

# Create the GridSearchCV model. n_jobs=-1 evaluates the 48 candidates x 5
# folds in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=mlp_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Search on the training split only: fitting on the full X/y would let the
# held-out test rows influence the chosen hyperparameters.
grid_search.fit(X_train, y_train)

# Get the best model and best hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best model:", best_model)
print("Best parameters:", best_params)
print("Best accuracy score:", best_score)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def _print_split_report(title, clf, X, y):
    """Predict on one split and print accuracy, class report and confusion matrix."""
    pred = clf.predict(X)
    # zero_division=0 keeps the 0.0 scores for classes that are never predicted
    # but suppresses the UndefinedMetricWarning emitted for each of them.
    clf_report = pd.DataFrame(
        classification_report(y, pred, output_dict=True, zero_division=0)
    )
    print(f"{title} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(y, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y, pred)}\n")


def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print evaluation metrics for a fitted classifier on one data split.

    Args:
        clf: fitted estimator exposing ``predict``.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        train: when truthy, report on the training split; otherwise report on
            the test split.

    Returns:
        None. The report is written to stdout.
    """
    # A plain if/else guarantees that some report is always printed; the old
    # `elif train == False` silently did nothing for values such as None.
    if train:
        _print_split_report("Train", clf, X_train, y_train)
    else:
        _print_split_report("Test", clf, X_test, y_test)

In [None]:
# Fit a linear-kernel SVM on the unbalanced training split.
svm_classifier = SVC(kernel='linear', C=0.1).fit(X_train, y_train)

# 4. Evaluate the SVM model: training split first, then the test split.
print_score(
    clf=svm_classifier,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    train=True,
)
print_score(
    clf=svm_classifier,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    train=False,
)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train Result:
Accuracy Score: 89.74%
_______________________________________________
CLASSIFICATION REPORT:
                0            1  accuracy     macro avg  weighted avg
precision     0.0     0.897446  0.897446      0.448723      0.805409
recall        0.0     1.000000  0.897446      0.500000      0.897446
f1-score      0.0     0.945952  0.897446      0.472976      0.848940
support    1044.0  9136.000000  0.897446  10180.000000  10180.000000
_______________________________________________
Confusion Matrix: 
 [[   0 1044]
 [   0 9136]]

Test Result:
Accuracy Score: 89.98%
_______________________________________________
CLASSIFICATION REPORT:
               0            1  accuracy    macro avg  weighted avg
precision    0.0     0.899843  0.899843     0.449921      0.809717
recall       0.0     1.000000  0.899843     0.500000      0.899843
f1-score     0.0     0.947281  0.899843     0.473641      0.852404
support    255.0  2291.000000  0.899843  2546.000000   2546.000000
_________

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Fit a linear-kernel SVM on the class-balanced training split.
svm_balanced_classifier = SVC(kernel='linear', C=0.1).fit(X_balanced_train, y_balanced_train)

# 4. Evaluate the SVM model: training split first, then the test split.
print_score(
    clf=svm_balanced_classifier,
    X_train=X_balanced_train,
    y_train=y_balanced_train,
    X_test=X_balanced_test,
    y_test=y_balanced_test,
    train=True,
)
print_score(
    clf=svm_balanced_classifier,
    X_train=X_balanced_train,
    y_train=y_balanced_train,
    X_test=X_balanced_test,
    y_test=y_balanced_test,
    train=False,
)

In [None]:
# Random forest on the unbalanced split. random_state pins the bootstrap and
# feature sampling so the reported scores are repeatable between runs.
rf_classifier = RandomForestClassifier(
    max_depth=None,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

# Report on the training split first, then on the held-out test split.
print_score(rf_classifier, X_train, y_train, X_test, y_test, train=True)
print_score(rf_classifier, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 89.74%
_______________________________________________
CLASSIFICATION REPORT:
                0            1  accuracy     macro avg  weighted avg
precision     0.0     0.897446  0.897446      0.448723      0.805409
recall        0.0     1.000000  0.897446      0.500000      0.897446
f1-score      0.0     0.945952  0.897446      0.472976      0.848940
support    1044.0  9136.000000  0.897446  10180.000000  10180.000000
_______________________________________________
Confusion Matrix: 
 [[   0 1044]
 [   0 9136]]

Test Result:
Accuracy Score: 89.98%
_______________________________________________
CLASSIFICATION REPORT:
               0            1  accuracy    macro avg  weighted avg
precision    0.0     0.899843  0.899843     0.449921      0.809717
recall       0.0     1.000000  0.899843     0.500000      0.899843
f1-score     0.0     0.947281  0.899843     0.473641      0.852404
support    255.0  2291.000000  0.899843  2546.000000   2546.000000
_________

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Random forest on the class-balanced split. random_state pins the bootstrap
# and feature sampling so the reported scores are repeatable between runs.
rf_balanced_classifier = RandomForestClassifier(
    max_depth=None,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100,
    random_state=42,
)
rf_balanced_classifier.fit(X_balanced_train, y_balanced_train)

# Report on the training split first, then on the held-out test split.
print_score(rf_balanced_classifier, X_balanced_train, y_balanced_train, X_balanced_test, y_balanced_test, train=True)
print_score(rf_balanced_classifier, X_balanced_train, y_balanced_train, X_balanced_test, y_balanced_test, train=False)

In [None]:
# MLP on the unbalanced split; the seed makes weight initialisation repeatable.
mlp_classifier = MLPClassifier(random_state=42)
mlp_classifier.fit(X_train, y_train)

# Evaluate the MLP that was just trained (this cell previously reported the
# random forest's scores by passing rf_classifier here).
print_score(mlp_classifier, X_train, y_train, X_test, y_test, train=True)
print_score(mlp_classifier, X_train, y_train, X_test, y_test, train=False)

Best model: SVC(C=0.1, kernel='linear')
Best parameters: {'C': 0.1, 'kernel': 'linear'}
Best accuracy score: 0.897925510489122


In [None]:
# MLP on the class-balanced split; the seed makes weight initialisation repeatable.
mlp_balanced_classifier = MLPClassifier(random_state=42)
mlp_balanced_classifier.fit(X_balanced_train, y_balanced_train)

# Evaluate the MLP that was just trained (this cell previously reported the
# balanced random forest's scores by passing rf_balanced_classifier here).
print_score(mlp_balanced_classifier, X_balanced_train, y_balanced_train, X_balanced_test, y_balanced_test, train=True)
print_score(mlp_balanced_classifier, X_balanced_train, y_balanced_train, X_balanced_test, y_balanced_test, train=False)