In [2]:
import pandas as pd

# Read the two source workbooks (Japanese first, then Indian) into DataFrames.
japanese_data, indian_data = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        'C:/Users/DELL/Documents/AI/excel file/project1.xlsx',
        'C:/Users/DELL/Documents/AI/excel file/prjdataset1.xlsx',
    )
)

# Preview the leading rows of each frame under its own heading.
# A single print with a newline separator emits the same text as four
# consecutive print calls would.
print(
    "Japanese Dataset:",
    japanese_data.head(),
    "\nIndian Dataset:",
    indian_data.head(),
    sep="\n",
)


Japanese Dataset:
           ID  CURR_AGE GENDER  ANN_INCOME  AGE_CAR  PURCHASE
0  00001Q15YJ        50      M      445344      439         0
1  00003I71CQ        35      M      107634      283         0
2  00003N47FS        59      F      502787      390         1
3  00005H41DE        43      M      585664      475         0
4  00007E17UM        39      F      705723      497         1

Indian Dataset:
           ID  CURR_AGE GENDER  ANN_INCOME             DT_MAINT
0  20710B05XL        54      M     1425390            4/20/2018
1  89602T51HX        47      M     1678954  2018-08-06 00:00:00
2  70190Z52IP        60      M      931624            7/31/2017
3  25623V15MU        55      F     1106320            7/31/2017
4  36230I68CE        32      F      748465            1/27/2019


In [3]:
# Split each dataset's column labels into a numeric group and an
# object-dtype ("categorical") group.
numeric_cols_indian, categorical_cols_indian = (
    indian_data.select_dtypes(include=[dtype_family]).columns
    for dtype_family in ('number', 'object')
)
numeric_cols_japanese, categorical_cols_japanese = (
    japanese_data.select_dtypes(include=[dtype_family]).columns
    for dtype_family in ('number', 'object')
)

# Report the four groups, Indian dataset first.
for caption, column_group in (
    ("Numeric columns in Indian dataset:", numeric_cols_indian),
    ("Categorical columns in Indian dataset:", categorical_cols_indian),
    ("Numeric columns in Japanese dataset:", numeric_cols_japanese),
    ("Categorical columns in Japanese dataset:", categorical_cols_japanese),
):
    print(caption, column_group)

Numeric columns in Indian dataset: Index(['CURR_AGE', 'ANN_INCOME'], dtype='object')
Categorical columns in Indian dataset: Index(['ID', 'GENDER', 'DT_MAINT'], dtype='object')
Numeric columns in Japanese dataset: Index(['CURR_AGE', 'ANN_INCOME', 'AGE_CAR', 'PURCHASE'], dtype='object')
Categorical columns in Japanese dataset: Index(['ID', 'GENDER'], dtype='object')


In [10]:
# Show the column labels currently present on the Japanese dataset.
japanese_column_labels = japanese_data.columns
print(japanese_column_labels)


Index(['ID', 'CURR_AGE', 'GENDER', 'ANN_INCOME', 'AGE_CAR', 'PURCHASE'], dtype='object')


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Load the Japanese dataset that the classifier is trained on.
# read_excel already returns a DataFrame, so no extra pd.DataFrame() wrap is
# needed; `data` is kept as a name for the same frame.
data = pd.read_excel('C:/Users/DELL/Documents/AI/excel file/project1.xlsx')
japanese_data = data

# Check the columns in the DataFrame
print("Columns in the DataFrame:", japanese_data.columns)

# Fail early with a clear message if the target column is missing
if 'PURCHASE' not in japanese_data.columns:
    raise ValueError("The column 'PURCHASE' is not found in the DataFrame")

# Feature groups: declared once and reused for both the column selection and
# the ColumnTransformer so the two cannot drift apart.
numeric_features = ['CURR_AGE', 'ANN_INCOME', 'AGE_CAR']
categorical_features = ['GENDER']

# Define feature and target variables
X = japanese_data[numeric_features + categorical_features]
y = japanese_data['PURCHASE']

# Split the dataset into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns are one-hot encoded. handle_unknown='ignore' matters
# because this fitted preprocessor is later applied to a different dataset:
# a category never seen during fit is encoded as all zeros instead of
# raising an error at transform time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit on the training split only, then apply the same fitted transform to the
# test split so no test-set statistics leak into the scaler/encoder.
# NOTE: from here on X_train / X_test hold the transformed matrices, not the
# original DataFrame slices.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

print("Preprocessing completed successfully.")


Columns in the DataFrame: Index(['ID', 'CURR_AGE', 'GENDER', 'ANN_INCOME', 'AGE_CAR', 'PURCHASE'], dtype='object')
Preprocessing completed successfully.


In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Build and fit the random forest in one step; fit() returns the estimator
# itself, and the fixed seed keeps the run repeatable.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split: probability of the second class (column 1)
# for the ROC-AUC, plus the hard class labels for the other reports.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Report per-class precision/recall, the confusion matrix, and ROC-AUC.
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_prob)

print(report_text)
print(confusion)
print("ROC-AUC Score:", auc_value)


              precision    recall  f1-score   support

           0       0.64      0.61      0.62      2964
           1       0.71      0.73      0.72      3856

    accuracy                           0.68      6820
   macro avg       0.67      0.67      0.67      6820
weighted avg       0.68      0.68      0.68      6820

[[1804 1160]
 [1031 2825]]
ROC-AUC Score: 0.7470056042496123


In [38]:
import pandas as pd
from sklearn.pipeline import Pipeline

# Load the Indian dataset that the trained model will score.
# NOTE(review): this rebinds `indian_data` to a different workbook than the
# one loaded at the top of the notebook (prjdataset1.xlsx, whose preview
# shows no AGE_CAR column). INDIAN.xlsx is expected to carry AGE_CAR --
# confirm how that file was produced.
indian_data =  pd.read_excel('C:/Users/DELL/Documents/AI/excel file/INDIAN.xlsx')

# Select the feature columns for scoring. Nothing is dropped here: this
# selection contains no PURCHASE column -- that is what is being predicted.
# NOTE(review): DT_MAINT is not among the columns the preprocessor was
# configured with, so it is presumably ignored by the transform -- confirm.
indian_X = indian_data[['CURR_AGE', 'ANN_INCOME','DT_MAINT','GENDER','AGE_CAR']]

# Apply the already-fitted preprocessing pipeline (transform only, no refit)
indian_X_transformed = preprocessor.transform(indian_X)

# Requires `model` to have been trained in an earlier cell
indian_y_pred = model.predict(indian_X_transformed)

# Estimate the number of potential customers in the Indian market.
# Predictions are 0/1 labels, so their sum is the count of predicted buyers.
# ndarray.sum() is vectorised; the builtin sum() would iterate the array
# element by element in Python.
potential_customers = indian_y_pred.sum()

print("\nEstimated number of potential customers in the Indian market:", potential_customers)



Estimated number of potential customers in the Indian market: 65855


In [39]:
import pandas as pd


# Column labels produced by the fitted preprocessor; both exports share them.
processed_feature_names = preprocessor.get_feature_names_out()

# Rebuild labelled frames from the transformed matrices and append PURCHASE:
# model predictions for the Indian rows, observed labels for the Japanese
# test split (y_test is positionally aligned with X_test).
indian_data_processed = pd.DataFrame(
    indian_X_transformed, columns=processed_feature_names
).assign(PURCHASE=indian_y_pred)
japanese_data_processed = pd.DataFrame(
    X_test, columns=processed_feature_names
).assign(PURCHASE=y_test.values)

# Write each frame to its own workbook in the working directory, without the index.
for processed_frame, workbook_name in (
    (indian_data_processed, 'indian_data_processed.xlsx'),
    (japanese_data_processed, 'japanese_data_processed.xlsx'),
):
    processed_frame.to_excel(workbook_name, index=False)

print("Data saved to Excel files successfully.")


Data saved to Excel files successfully.
