<a href="https://colab.research.google.com/github/ruthu06/Pattern-Recognition-and-Data-Mining/blob/main/onlinedataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
df = pd.read_excel('/content/online_shoppers_data.xlsx')

def calculate_page_values_per_product_dur(row):
    if row['ProductRelated_Duration'] != 0:
        return row['PageValues'] / row['ProductRelated_Duration']
    else:
        return -1

def calculate_page_values_per_product_view(row):
    if row['ProductRelated_Duration'] != 0:
        return row['PageValues'] / row['ProductRelated']
    else:
        return -1

df['page_values_per_product_dur'] = df.apply(calculate_page_values_per_product_dur, axis=1)
df['page_values_per_product_view'] = df.apply(calculate_page_values_per_product_view, axis=1)
# Write the modified DataFrame back to the Excel file
df.to_excel('/content/online_shoppers_intention_with_calculations.xlsx', index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_excel('/content/AUGMENTED_DATA_TRUE.xlsx')

df.drop(columns=[ 'Region', 'Browser', 'TrafficType'], inplace=True)
columns_to_encode = ['Month', 'VisitorType', 'Weekend','DurationPeriod','Seasons']

# Perform one-hot encoding for selected columns
df_encoded = pd.get_dummies(df, columns=columns_to_encode)

# Split the dataset into features (X) and target variable (y)
X = df_encoded.drop(columns=['Revenue'])
y = df['Revenue']

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)


# Split the dataset into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize and train a machine learning model (Random Forest Classifier in this example)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Validate the model using the validation set
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

from sklearn.metrics import recall_score

# Assuming y_test and y_pred are your actual labels and predicted labels, respectively
# Calculate recall
recall = recall_score(y_test, y_test_pred)

print("Recall:", recall)


Validation Accuracy: 0.94293210888816
Test Accuracy: 0.9406557377049181
Recall: 0.9311010946555055


In [None]:
# Get feature importances from the trained model
feature_importances = model.feature_importances_

# Create a DataFrame to store feature importances along with corresponding column names
importance_df = pd.DataFrame({'Feature': df_encoded.drop(columns=['Revenue']).columns,
                              'Importance': feature_importances})

# Sort the DataFrame by importance values in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print or visualize the top N most important features
top_n = 10  # Specify the number of top features you want to display
print("Top", top_n, "Most Important Features:")
print(importance_df.head(top_n))



Top 10 Most Important Features:
                           Feature  Importance
8                       PageValues    0.151336
14    page_values_per_product_view    0.117122
13     page_values_per_product_dur    0.084908
2976                   Weekend_0.0    0.065211
6                      BounceRates    0.060194
7                        ExitRates    0.053332
9                       SpecialDay    0.052474
3           Informational_Duration    0.051385
2                    Informational    0.046133
1          Administrative_Duration    0.035806


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [80,100,120],
    'max_depth': [None, 10,12,15],
    'min_samples_split': [2, 5],
    'max_features': ['auto', 'sqrt']
}

# Create the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate on the test set
test_accuracy = grid_search.score(X_val, y_val)
print("Test Accuracy:", test_accuracy)


Best Hyperparameters: {'max_depth': 12, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 120}
Test Accuracy: 0.9422761561167596


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_excel('/content/AUGMENTED_DATA_TRUE.xlsx')

# Drop columns
df.drop(columns=['Month', 'Region', 'Browser', 'DurationPeriod', 'Seasons', 'TrafficType', 'VisitorType'], inplace=True)

# Split the dataset into features (X) and target variable (y)
X = df.drop(columns=['Revenue'])
y = df['Revenue']

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the dataset into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize and train a machine learning model (Gradient Boosting Machine in this example)
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Validate the model using the validation set
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy (Gradient Boosting Machine):", val_accuracy)


Validation Accuracy (Gradient Boosting Machine): 0.9383404394883569


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [60,80, 100],
    'max_depth': [ 5,6, 7],  # Adjusted max_depth for Gradient Boosting
    'min_samples_split': [2,3, 5],
    'max_features': ['auto', 'sqrt']
}

# Create the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate on the validation set
val_accuracy = grid_search.best_estimator_.score(X_val, y_val)
print("Validation Accuracy (Gradient Boosting Machine):", val_accuracy)

Best Hyperparameters: {'max_depth': 6, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
Validation Accuracy (Gradient Boosting Machine): 0.9419481797310594


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train a machine learning model (Decision Tree Classifier in this example)
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Validate the model using the validation set
y_val_pred_dt = dt_model.predict(X_val)
val_accuracy_dt = accuracy_score(y_val, y_val_pred_dt)
print("Validation Accuracy (Decision Tree):", val_accuracy_dt)


Validation Accuracy (Decision Tree): 0.9130862577894392


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid for Decision Tree
param_grid_dt = {
    'max_depth': [5, 6, 7],
    'min_samples_split': [2, 3, 5],
    'max_features': ['auto', 'sqrt']
}

# Create the Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Perform GridSearchCV for Decision Tree
grid_search_dt = GridSearchCV(estimator=dt_classifier, param_grid=param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)

# Print the best hyperparameters for Decision Tree
print("Best Hyperparameters (Decision Tree):", grid_search_dt.best_params_)

# Evaluate Decision Tree on the validation set
val_accuracy_dt = grid_search_dt.best_estimator_.score(X_val, y_val)
print("Validation Accuracy (Decision Tree):", val_accuracy_dt)


Best Hyperparameters (Decision Tree): {'max_depth': 7, 'max_features': 'auto', 'min_samples_split': 2}
Validation Accuracy (Decision Tree): 0.8934559221200649


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Initialize individual classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Create the ensemble using VotingClassifier
ensemble_classifier = VotingClassifier(estimators=[
    ('rf', rf_classifier),
    ('gb', gb_classifier),
    ('dt', dt_classifier)
], voting='hard')  # 'hard' voting means majority voting

# Train the ensemble on the training data
ensemble_classifier.fit(X_train, y_train)

# Predict on the validation set
y_val_pred_ensemble = ensemble_classifier.predict(X_val)

# Calculate validation accuracy
val_accuracy_ensemble = accuracy_score(y_val, y_val_pred_ensemble)
print("Validation Accuracy (Ensemble):", val_accuracy_ensemble)


Validation Accuracy (Ensemble): 0.9406362741882585


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# Initialize individual classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Create the ensemble using VotingClassifier
ensemble_classifier = VotingClassifier(estimators=[
    ('rf', rf_classifier),
    ('gb', gb_classifier),
    ('dt', dt_classifier)
], voting='hard')  # 'hard' voting means majority voting

# Perform cross-validation
cv_scores = cross_val_score(ensemble_classifier, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

Cross-Validation Scores: [0.93572181 0.93183415 0.93042867 0.93780745 0.93640197]
Mean Accuracy: 0.9344388108353876


In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Initialize RandomOverSampler for oversampling
oversampler = RandomOverSampler(random_state=42)

# Fit and apply oversampling to the training data
X_train_resampled_over, y_train_resampled_over = oversampler.fit_resample(X_train, y_train)

# Initialize RandomUnderSampler for undersampling
undersampler = RandomUnderSampler(random_state=42)

# Fit and apply undersampling to the training data
X_train_resampled_under, y_train_resampled_under = undersampler.fit_resample(X_train, y_train)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Convert class weights to dictionary
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# Initialize RandomForestClassifier with oversampling
rf_classifier_over = RandomForestClassifier(random_state=42)
rf_classifier_over.fit(X_train_resampled_over, y_train_resampled_over)

# Initialize RandomForestClassifier with undersampling
rf_classifier_under = RandomForestClassifier(random_state=42)
rf_classifier_under.fit(X_train_resampled_under, y_train_resampled_under)

# Initialize RandomForestClassifier with class weights
rf_classifier_weighted = RandomForestClassifier(random_state=42, class_weight=class_weight_dict)
rf_classifier_weighted.fit(X_train, y_train)


# Predict on the test set using classifiers trained with different techniques
y_pred_over = rf_classifier_over.predict(X_test)
y_pred_under = rf_classifier_under.predict(X_test)
y_pred_weighted = rf_classifier_weighted.predict(X_test)

# Calculate accuracy for each classifier
accuracy_over = accuracy_score(y_test, y_pred_over)
accuracy_under = accuracy_score(y_test, y_pred_under)
accuracy_weighted = accuracy_score(y_test, y_pred_weighted)

print("Accuracy (RandomForest with oversampling):", accuracy_over)
print("Accuracy (RandomForest with undersampling):", accuracy_under)
print("Accuracy (RandomForest with class weights):", accuracy_weighted)

Accuracy (RandomForest with oversampling): 0.8913513513513514
Accuracy (RandomForest with undersampling): 0.8491891891891892
Accuracy (RandomForest with class weights): 0.8902702702702703


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize GradientBoostingClassifier with oversampling
gb_classifier_over = GradientBoostingClassifier(random_state=42)
gb_classifier_over.fit(X_train_resampled_over, y_train_resampled_over)

# Initialize GradientBoostingClassifier with undersampling
gb_classifier_under = GradientBoostingClassifier(random_state=42)
gb_classifier_under.fit(X_train_resampled_under, y_train_resampled_under)

# Initialize GradientBoostingClassifier with class weights
gb_classifier_weighted = GradientBoostingClassifier(random_state=42)
gb_classifier_weighted.fit(X_train, y_train)

# Predict on the test set using GradientBoostingClassifier trained with different techniques
y_pred_over_gb = gb_classifier_over.predict(X_test)
y_pred_under_gb = gb_classifier_under.predict(X_test)
y_pred_weighted_gb = gb_classifier_weighted.predict(X_test)

# Calculate accuracy for each GradientBoostingClassifier
accuracy_over_gb = accuracy_score(y_test, y_pred_over_gb)
accuracy_under_gb = accuracy_score(y_test, y_pred_under_gb)
accuracy_weighted_gb = accuracy_score(y_test, y_pred_weighted_gb)

print("Accuracy (GradientBoosting with oversampling):", accuracy_over_gb)
print("Accuracy (GradientBoosting with undersampling):", accuracy_under_gb)
print("Accuracy (GradientBoosting with class weights):", accuracy_weighted_gb)

Accuracy (GradientBoosting with oversampling): 0.8594594594594595
Accuracy (GradientBoosting with undersampling): 0.86
Accuracy (GradientBoosting with class weights): 0.8967567567567568


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize DecisionTreeClassifier with oversampling
dt_classifier_over = DecisionTreeClassifier(random_state=42)
dt_classifier_over.fit(X_train_resampled_over, y_train_resampled_over)

# Initialize DecisionTreeClassifier with undersampling
dt_classifier_under = DecisionTreeClassifier(random_state=42)
dt_classifier_under.fit(X_train_resampled_under, y_train_resampled_under)

# Initialize DecisionTreeClassifier with class weights
dt_classifier_weighted = DecisionTreeClassifier(random_state=42, class_weight=class_weight_dict)
dt_classifier_weighted.fit(X_train, y_train)

# Predict on the test set using DecisionTreeClassifier trained with different techniques
y_pred_over_dt = dt_classifier_over.predict(X_test)
y_pred_under_dt = dt_classifier_under.predict(X_test)
y_pred_weighted_dt = dt_classifier_weighted.predict(X_test)

# Calculate accuracy for each DecisionTreeClassifier
accuracy_over_dt = accuracy_score(y_test, y_pred_over_dt)
accuracy_under_dt = accuracy_score(y_test, y_pred_under_dt)
accuracy_weighted_dt = accuracy_score(y_test, y_pred_weighted_dt)

print("Accuracy (DecisionTree with oversampling):", accuracy_over_dt)
print("Accuracy (DecisionTree with undersampling):", accuracy_under_dt)
print("Accuracy (DecisionTree with class weights):", accuracy_weighted_dt)

Accuracy (DecisionTree with oversampling): 0.8551351351351352
Accuracy (DecisionTree with undersampling): 0.7924324324324324
Accuracy (DecisionTree with class weights): 0.8545945945945946


In [None]:
from sklearn.ensemble import VotingClassifier

# Initialize VotingClassifier with RandomForestClassifier, GradientBoostingClassifier, and DecisionTreeClassifier trained with different techniques
voting_classifier = VotingClassifier(estimators=[
    ('rf_over', rf_classifier_over),
    ('rf_under', rf_classifier_under),
    ('rf_weighted', rf_classifier_weighted),
    ('gb_over', gb_classifier_over),
    ('gb_under', gb_classifier_under),
    ('gb_weighted', gb_classifier_weighted),
    ('dt_over', dt_classifier_over),
    ('dt_under', dt_classifier_under),
    ('dt_weighted', dt_classifier_weighted)
], voting='soft')  # 'hard' voting means majority voting

# Train the VotingClassifier
voting_classifier.fit(X_train, y_train)

# Predict on the test set using the VotingClassifier
y_pred_voting = voting_classifier.predict(X_test)

# Calculate accuracy for the VotingClassifier
accuracy_voting = accuracy_score(y_test, y_pred_voting)

print("Accuracy (VotingClassifier):", accuracy_voting)

Accuracy (VotingClassifier): 0.8897297297297297


In [None]:
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_excel('/content/online_shoppers_intention_with_calculations (1).xlsx')  # Replace with the path to your Excel file

# Separate the dataset into true and false samples
true_samples = df[df['Revenue'] == True]
false_samples = df[df['Revenue'] == False]

num_additional_true_samples = 8000
augmented_true_samples = pd.DataFrame(columns=df.columns)

for _ in range(num_additional_true_samples):
    # Randomly select a true sample
    sample = true_samples.iloc[np.random.randint(0, len(true_samples))]

    # Augment the sample by adding noise to numerical columns (excluding 'Revenue' column)
    for col in df.select_dtypes(include=np.number).columns:
        if col != 'Revenue':
            noise = np.random.normal(scale=0.1)  # Adjust the scale of noise as needed
            sample[col] += noise

    augmented_true_samples = augmented_true_samples.append(sample, ignore_index=True)

augmented_df = pd.concat([true_samples, augmented_true_samples, false_samples], ignore_index=True)

# Save the augmented data to a new Excel file
augmented_df.to_excel('/content/online_shoppers_intention_with_calculations (24).xlsx', index=False)


In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# Define the hyperparameters to search
param_grid = {
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__C': [0.1, 1, 10],
    'svm__gamma': [0.1, 1, 10]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy - SVM:", accuracy)