In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv("Travel.csv")
df


import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Assuming df, num_cols, cat_cols, and imputer are already defined
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns
imputer = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), num_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), cat_cols)
    ]
)

imputed_data = imputer.fit_transform(df)

# Convert imputed_data to DataFrame
df_imputed = pd.DataFrame(imputed_data, columns=num_cols.tolist() + cat_cols.tolist())

# Check for missing values
print("After handling missing values:\n",df_imputed.isnull().sum())

# Replace 'Fe Male' with 'Female'
df['Gender'] = df['Gender'].replace('Fe Male', 'Female')
from sklearn.preprocessing import LabelEncoder

# Select categorical columns
cat_cols = ['TypeofContact','ProdTaken','Occupation', 'Gender', 'MaritalStatus', 'Designation']

# Initialize LabelEncoder
encoder = LabelEncoder()

# Encode categorical columns
for col in cat_cols:
    df_imputed[col] = encoder.fit_transform(df_imputed[col])

# Encode categorical columns and print value counts
for col in cat_cols:
    df_imputed[col] = encoder.fit_transform(df_imputed[col])
    print(f"Value counts for {col}:")
    print(df_imputed[col].value_counts())
    print("\n")


# Separate the dataset into two subsets based on 'ProdTaken' values
prod_taken_0 = df_imputed[df_imputed['ProdTaken'] == 0]
prod_taken_1 = df_imputed[df_imputed['ProdTaken'] == 1]
# Save prod_taken_1_copy as another CSV file
prod_taken_1.to_csv('prod1.csv', index=False)
prod_taken_0.to_csv('prod0.csv', index=False)


import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df1 = pd.read_csv('prod1.csv')

# Define features (X) and target variable (y)
features_1 = df1.drop('ProductPitched', axis=1).columns.tolist()  # Drop the target variable and extract column names as a list
target_variable_1 = 'ProductPitched'

# Extract features (X) and target variable (y)
x = df1[features_1]
y = df1[target_variable_1]

# Display the first few rows of X and y
print("Feature Matrix (X):")
print(x.head())
print("\nTarget Variable (y):")
print(y.head())
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Assuming X and y are already defined and contain categorical features and target variable respectively

# Initialize SelectKBest with chi2 scoring function
selector = SelectKBest(score_func=chi2, k=10)  # Select top 5 features, adjust k as needed

# Fit selector to data
selector.fit(x, y)

# Get selected feature indices
selected_feature_indices = selector.get_support(indices=True)

# Get selected feature names
selected_features = x.columns[selected_feature_indices]

# Display selected features
print("Selected Features:")
print(selected_features)
x=df1[selected_features]

After handling missing values:
 CustomerID                  0
ProdTaken                   0
Age                         0
CityTier                    0
DurationOfPitch             0
NumberOfPersonVisiting      0
NumberOfFollowups           0
PreferredPropertyStar       0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
MonthlyIncome               0
TypeofContact               0
Occupation                  0
Gender                      0
ProductPitched              0
MaritalStatus               0
Designation                 0
dtype: int64
Value counts for TypeofContact:
TypeofContact
1    3469
0    1419
Name: count, dtype: int64


Value counts for ProdTaken:
ProdTaken
0    3968
1     920
Name: count, dtype: int64


Value counts for Occupation:
Occupation
2    2368
3    2084
1     434
0       2
Name: count, dtype: int64


Value counts for Gender:
Gender
2    2916
1    1817
0     155
Name:

In [2]:

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# Display the shape of the resulting datasets
print("\nTraining feature set shape:", X_train.shape)
print("Validation feature set shape:", X_val.shape)
print("Training target set shape:", y_train.shape)
print("Validation target set shape:", y_val.shape)


Training feature set shape: (736, 10)
Validation feature set shape: (184, 10)
Training target set shape: (736,)
Validation target set shape: (184,)


In [3]:


from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and validation data
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Display the shape of the scaled datasets
print("\nScaled Training feature set shape:", X_train_scaled.shape)
print("Scaled Validation feature set shape:", X_val_scaled.shape)



Scaled Training feature set shape: (736, 10)
Scaled Validation feature set shape: (184, 10)


In [4]:
# Check the distribution of the target variable in the training and validation sets
print("Training set class distribution:")
print(y_train.value_counts())

print("\nValidation set class distribution:")
print(y_val.value_counts())


Training set class distribution:
ProductPitched
Basic           449
Deluxe          162
Standard         93
Super Deluxe     17
King             15
Name: count, dtype: int64

Validation set class distribution:
ProductPitched
Basic           103
Deluxe           42
Standard         31
King              5
Super Deluxe      3
Name: count, dtype: int64


In [5]:
from imblearn.over_sampling import SMOTE
from collections import Counter
class_counts = Counter(y_train)
majority_class = max(class_counts, key=class_counts.get)
print(majority_class)
minority_classes = [cls for cls, count in class_counts.items() if count < class_counts[majority_class]]
print(minority_classes)
X_resampled, y_resampled = X_train, y_train
for cls in minority_classes:
    smote = SMOTE(sampling_strategy={cls: class_counts[majority_class]}, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_resampled, y_resampled)
print(f'Original class distribution: {Counter(y_train)}')
print(f'Resampled class distribution: {Counter(y_resampled)}')


Basic
['Standard', 'King', 'Deluxe', 'Super Deluxe']
Original class distribution: Counter({'Basic': 449, 'Deluxe': 162, 'Standard': 93, 'Super Deluxe': 17, 'King': 15})
Resampled class distribution: Counter({'Basic': 449, 'Standard': 449, 'King': 449, 'Deluxe': 449, 'Super Deluxe': 449})


In [6]:
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Assuming X_train and y_train are already defined
class_counts = Counter(y_train)
majority_class = max(class_counts, key=class_counts.get)
print(f"Majority class: {majority_class}")
minority_classes = [cls for cls, count in class_counts.items() if count < class_counts[majority_class]]
print(f"Minority classes: {minority_classes}")

X_resampled, y_resampled = X_train, y_train
for cls in minority_classes:
    smote = SMOTE(sampling_strategy={cls: class_counts[majority_class]}, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_resampled, y_resampled)

print(f'Original class distribution: {Counter(y_train)}')
print(f'Resampled class distribution: {Counter(y_resampled)}')

# Ensure resampled data has consistent lengths
print(f'Resampled X shape: {X_resampled.shape}')
print(f'Resampled y shape: {y_resampled.shape}')

# Scale the resampled data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_val_scaled = scaler.transform(X_val)

# Define the models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GBM': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'kNN': KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors
}

# Train and evaluate each model
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_resampled)
    print("Training complete.")

    # Make predictions
    y_pred_train = model.predict(X_train_scaled)
    y_pred_val = model.predict(X_val_scaled)

    # Calculate accuracy
    train_accuracy = accuracy_score(y_resampled, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)

    # Print results
    print(f"\n{name}:")
    print(f"Training Accuracy: {train_accuracy:.2f}")
    print(f"Validation Accuracy: {val_accuracy:.2f}")
    print("Classification Report on Validation Data:")
    print(classification_report(y_val, y_pred_val))

    # Print actual vs predicted
    print("Actual vs Predicted:")
    if isinstance(y_val, pd.Series):
        for i in range(len(y_val)):
            print(f"Actual: {y_val.iloc[i]}, Predicted: {y_pred_val[i]}")
    elif isinstance(y_val, np.ndarray):
        for i in range(len(y_val)):
            print(f"Actual: {y_val[i]}, Predicted: {y_pred_val[i]}")

    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train_scaled, y_resampled, cv=5)
    print(f"Cross-Validation Accuracy (5-fold): {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})\n")


Majority class: Basic
Minority classes: ['Standard', 'King', 'Deluxe', 'Super Deluxe']
Original class distribution: Counter({'Basic': 449, 'Deluxe': 162, 'Standard': 93, 'Super Deluxe': 17, 'King': 15})
Resampled class distribution: Counter({'Basic': 449, 'Standard': 449, 'King': 449, 'Deluxe': 449, 'Super Deluxe': 449})
Resampled X shape: (2245, 10)
Resampled y shape: (2245,)
Training Random Forest...
Training complete.

Random Forest:
Training Accuracy: 1.00
Validation Accuracy: 1.00
Classification Report on Validation Data:
              precision    recall  f1-score   support

       Basic       1.00      1.00      1.00       103
      Deluxe       1.00      1.00      1.00        42
        King       1.00      1.00      1.00         5
    Standard       1.00      1.00      1.00        31
Super Deluxe       1.00      1.00      1.00         3

    accuracy                           1.00       184
   macro avg       1.00      1.00      1.00       184
weighted avg       1.00      1.00

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Perform grid search
grid_search.fit(X_train_scaled, y_resampled)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 1.0


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize Random Forest classifier with the best parameters
best_params = {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
best_rf_classifier = RandomForestClassifier(**best_params, random_state=42)

# Train the tuned model on the training data
best_rf_classifier.fit(X_train_scaled, y_train)

# Make predictions on the training data
y_pred_train = best_rf_classifier.predict(X_train_scaled)

# Make predictions on the validation data
y_pred_val = best_rf_classifier.predict(X_val_scaled)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)

# Print the training accuracy
print("Training Accuracy:", train_accuracy)

# Print the validation accuracy and classification report
print("\nValidation Accuracy:", val_accuracy)
print("\nClassification Report on Validation Data:")
print(classification_report(y_val, y_pred_val))


ValueError: Found input variables with inconsistent numbers of samples: [2245, 736]