<a href="https://www.kaggle.com/code/rewa77/testing-with-meta-learning?scriptVersionId=187246322" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load BALANCED-DATA.csv from the Kaggle input directory and preview the
# first rows.
data = pd.read_csv('/kaggle/input/deepfake/BALANCED-DATA.csv')
data.head()

In [None]:
# (rows, columns) of the loaded dataset.
data.shape

In [None]:
# Count missing values per column.
data.isnull().sum() # author's observation: no null values

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Remove exact duplicate rows and renumber the index from 0.
data = data.drop_duplicates().reset_index(drop=True)

In [None]:
# Separate the 'Fake' target column from the remaining feature columns.
y = data['Fake']
X = data.drop('Fake', axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =.2, random_state = 0)

# Ensemble Learning and GridSearch

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Two gradient-boosting base models with a fixed seed.
xgb_clf = XGBClassifier(random_state=0)
lgbm_clf = LGBMClassifier(random_state=0)

# Soft-voting ensemble over the two boosters.
voting_clf = VotingClassifier(
    estimators=[('xgb', xgb_clf), ('lgbm', lgbm_clf)],
    voting='soft',
)
voting_clf.fit(X_train, y_train)

# Accuracy on the data the ensemble was fitted on.
y_train_pred = voting_clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Accuracy on the held-out split.
y_test_pred = voting_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Voting Classifier Train Accuracy:", train_accuracy)
print("Voting Classifier Test Accuracy:", test_accuracy)

In [None]:
from sklearn.model_selection import GridSearchCV

# Both boosters are searched over the same 2 x 3 x 3 grid.
param_grid_xgb = dict(
    learning_rate=[0.1, 0.01],
    max_depth=[3, 5, 7],
    n_estimators=[100, 200, 300],
)
param_grid_lgbm = dict(
    learning_rate=[0.1, 0.01],
    max_depth=[3, 5, 7],
    n_estimators=[100, 200, 300],
)

# 5-fold cross-validated accuracy search for XGBoost.
grid_search_xgb = GridSearchCV(xgb_clf, param_grid_xgb, scoring='accuracy', cv=5)
grid_search_xgb.fit(X_train, y_train)
best_xgb_clf = grid_search_xgb.best_estimator_

# Same search for LightGBM.
grid_search_lgbm = GridSearchCV(lgbm_clf, param_grid_lgbm, scoring='accuracy', cv=5)
grid_search_lgbm.fit(X_train, y_train)
best_lgbm_clf = grid_search_lgbm.best_estimator_

# Rebuild the soft-voting ensemble from the best estimator of each search.
voting_clf_tuned = VotingClassifier(
    estimators=[('xgb', best_xgb_clf), ('lgbm', best_lgbm_clf)],
    voting='soft',
)
voting_clf_tuned.fit(X_train, y_train)

train_accuracy_tuned = accuracy_score(y_train, voting_clf_tuned.predict(X_train))
test_accuracy_tuned = accuracy_score(y_test, voting_clf_tuned.predict(X_test))
print("Tuned Voting Classifier Train Accuracy:", train_accuracy_tuned)
print("Tuned Voting Classifier Test Accuracy:", test_accuracy_tuned)


# Meta Learning

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# VotingClassifier is the meta-learner below; import it here so this cell does
# not raise NameError when run without the earlier voting-ensemble cell.
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Fresh 80/20 split with a different seed; this rebinds the X_train/X_test/
# y_train/y_test names used by the earlier voting-ensemble cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Level-0 models whose cross-validated predictions feed the meta-learner.
base_learners = [
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier()),
    # NOTE(review): 'mlogloss' is the multi-class log-loss metric; if 'Fake'
    # is binary, 'logloss' is presumably intended. `use_label_encoder` is
    # deprecated in recent xgboost releases - confirm against the installed
    # version.
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('lgbm', LGBMClassifier()),
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    # probability=True is needed so SVC exposes predict_proba for soft voting.
    ('svc', SVC(probability=True))
]

# The meta-learner is itself a soft-voting ensemble over the same model types.
ensemble_meta_learner = VotingClassifier(estimators=base_learners, voting='soft')
stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=ensemble_meta_learner, cv=5)


stacking_clf.fit(X_train, y_train)
y_train_pred = stacking_clf.predict(X_train)

# Predict on the test data
y_test_pred = stacking_clf.predict(X_test)


train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)


print(f'Training Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

In [None]:
# Line plot comparing the stacking model's training and test accuracy.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(
    ['Training', 'Test'],
    [train_accuracy, test_accuracy],
    marker='o',
    linestyle='-',
    color='b',
    label='Accuracy',
)
ax.set_title('Training vs Test Accuracy')
ax.set_xlabel('Dataset')
ax.set_ylabel('Accuracy')
# The y-axis is fixed to the 0.8-1.0 band.
ax.set_ylim(0.8, 1.0)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report


def _show_confusion_matrix(matrix, title):
    """Draw *matrix* as an annotated blue heatmap under *title*."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title(title)
    plt.show()


# Stacking-model predictions for both splits.
y_pred_train = stacking_clf.predict(X_train)
y_pred_test = stacking_clf.predict(X_test)

# Confusion matrices: training split first, then test split.
cm_train = confusion_matrix(y_train, y_pred_train)
_show_confusion_matrix(cm_train, 'Confusion Matrix - Training Set')

cm_test = confusion_matrix(y_test, y_pred_test)
_show_confusion_matrix(cm_test, 'Confusion Matrix - Test Set')

# Per-class precision / recall / F1 for both splits.
print("Classification Report - Training Set:")
print(classification_report(y_train, y_pred_train))

print("Classification Report - Test Set:")
print(classification_report(y_test, y_pred_test))

In [None]:
import joblib

# Persist the fitted stacking model to 'stacking_clf.pkl' in the working
# directory; later cells reload it with joblib.load.
joblib.dump(stacking_clf, 'stacking_clf.pkl')

# Test with Arabic Real

In [None]:
# Load the first external test file (mariamAR.csv) into `df`.
AR = '/kaggle/input/deepfake/mariamAR.csv'
df = pd.read_csv(AR)

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Fill missing 'Fake' labels with 0 and store the column as integers.
# NOTE(review): presumably 0 means "real", matching this section's heading - confirm.
df['Fake'] = df['Fake'].fillna(0).astype(int)

In [None]:
# Number of exact duplicate rows (inspected only; none are dropped here).
df.duplicated().sum()

In [None]:
# Preview the first rows after the label fill.
df.head()

In [None]:
# Target labels and feature columns for this test set.
y_new = df['Fake']
X_new = df.drop(columns=['Fake'])

In [None]:
# Reload the stacking model saved earlier with joblib.dump.
stacking_clf = joblib.load('stacking_clf.pkl')

In [None]:
# Predict a label for every row of the test features.
y_pred_new = stacking_clf.predict(X_new)

In [None]:
# Attach the predictions to the frame they were computed from and write the
# result (features, labels, predictions) to test_predictions.csv.
df['predictions'] = y_pred_new
df.to_csv('test_predictions.csv', index=False)

print("Predictions:")
print(y_pred_new)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of rows whose predicted label equals the 'Fake' label.
y_true = y_new 
accuracy = accuracy_score(y_true, y_pred_new)

# Print accuracy
print("Accuracy:", accuracy)

# Test with Real English

In [None]:
# Load the English test file (englishAudio.csv) into `dff`.
ENG = '/kaggle/input/deepfake/englishAudio.csv'
dff = pd.read_csv(ENG)

In [None]:
# Count missing values per column.
dff.isnull().sum()

In [None]:
# Fill missing 'Fake' labels with 0 and store the column as integers.
dff['Fake'] = dff['Fake'].fillna(0).astype(int)

In [None]:
# Number of exact duplicate rows (inspected only; none are dropped here).
dff.duplicated().sum()

In [None]:
# Preview the first rows after the label fill.
dff.head()

In [None]:
# Target labels and feature columns for the English test set.
y_new2 = dff['Fake']
X_new2 = dff.drop(columns=['Fake'])

In [None]:
# Reload the persisted stacking model and predict on the features taken from `dff`.
stacking_clf = joblib.load('stacking_clf.pkl')
y_pred_new2 = stacking_clf.predict(X_new2)

In [None]:
# Attach the predictions to `dff`, the frame they were computed from. Writing
# them into the Arabic frame `df` would either fail on a length mismatch or
# pair the predictions with the wrong rows.
dff['predictions'] = y_pred_new2
# NOTE: every test section writes to this same file name, so each run
# overwrites the previous section's output.
dff.to_csv('test_predictions.csv', index=False)

print("Predictions:")
print(y_pred_new2)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of rows whose predicted label equals the 'Fake' label.
y_true = y_new2
accuracy = accuracy_score(y_true, y_pred_new2)

# Print accuracy
print("Accuracy:", accuracy)

# Testing with English for a speaker the model has seen before

In [None]:
# Load mariam_to_aboENG.csv into `test1`.
test = '/kaggle/input/deepfake/mariam_to_aboENG.csv'
test1 = pd.read_csv(test)

In [None]:
# Fill missing 'Fake' labels with 1 (the earlier sections fill with 0) and
# store the column as integers.
test1['Fake'] = test1['Fake'].fillna(1).astype(int)

In [None]:
# Target labels and feature columns for this test set.
y_new3 = test1['Fake']
X_new3 = test1.drop(columns=['Fake'])

In [None]:
# Reload the persisted stacking model and predict on the features taken from `test1`.
stacking_clf = joblib.load('stacking_clf.pkl')
y_pred_new3 = stacking_clf.predict(X_new3)

In [None]:
# Attach the predictions to `test1`, the frame they were computed from. Writing
# them into the unrelated frame `df` would either fail on a length mismatch
# or pair the predictions with the wrong rows.
test1['predictions'] = y_pred_new3
# NOTE: every test section writes to this same file name, so each run
# overwrites the previous section's output.
test1.to_csv('test_predictions.csv', index=False)

print("Predictions:")
print(y_pred_new3)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of rows whose predicted label equals the 'Fake' label.
y_true = y_new3
accuracy = accuracy_score(y_true, y_pred_new3)

# Print accuracy
print("Accuracy:", accuracy)

# Test with a new speaker in the same language the model was trained on

In [None]:
# Load mariam_to_margotAR.csv into `test2`.
test2 = pd.read_csv('/kaggle/input/deepfake/mariam_to_margotAR.csv')

In [None]:
# Fill missing 'Fake' labels with 1 and store the column as integers.
test2['Fake'] = test2['Fake'].fillna(1).astype(int)

In [None]:
# Target labels and feature columns for this test set.
y_new4 = test2['Fake']
X_new4 = test2.drop(columns=['Fake'])

In [None]:
# Reload the persisted stacking model and predict on the features taken from `test2`.
stacking_clf = joblib.load('stacking_clf.pkl')
y_pred_new4 = stacking_clf.predict(X_new4)

In [None]:
# Attach the predictions to `test2`, the frame they were computed from. Writing
# them into the unrelated frame `df` would either fail on a length mismatch
# or pair the predictions with the wrong rows.
test2['predictions'] = y_pred_new4
# NOTE: every test section writes to this same file name, so each run
# overwrites the previous section's output.
test2.to_csv('test_predictions.csv', index=False)

print("Predictions:")
print(y_pred_new4)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of rows whose predicted label equals the 'Fake' label.
y_true = y_new4
accuracy = accuracy_score(y_true, y_pred_new4)

# Print accuracy
print("Accuracy:", accuracy)

# Test with a speaker from the dataset in the same language

In [None]:
# Load mariam_to_nedalAR.csv into `test3`.
test3 = pd.read_csv('/kaggle/input/deepfake/mariam_to_nedalAR.csv')

In [None]:
# Fill missing 'Fake' labels with 1 and store the column as integers.
test3['Fake'] = test3['Fake'].fillna(1).astype(int)

In [None]:
# Target labels and feature columns for this test set.
y_new5 = test3['Fake']
X_new5 = test3.drop(columns=['Fake'])

In [None]:
# Reload the persisted stacking model and predict on the features taken from `test3`.
stacking_clf = joblib.load('stacking_clf.pkl')
y_pred_new5 = stacking_clf.predict(X_new5)

In [None]:
# Attach the predictions to `test3`, the frame they were computed from. Writing
# them into the unrelated frame `df` would either fail on a length mismatch
# or pair the predictions with the wrong rows.
test3['predictions'] = y_pred_new5
# NOTE: every test section writes to this same file name, so each run
# overwrites the previous section's output.
test3.to_csv('test_predictions.csv', index=False)

print("Predictions:")
# Print this section's predictions (y_pred_new5), not the previous section's.
print(y_pred_new5)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of rows whose predicted label equals the 'Fake' label.
y_true = y_new5
accuracy = accuracy_score(y_true, y_pred_new5)

# Print accuracy
print("Accuracy:", accuracy)