In [1]:
                                                  #Task 2
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Example URLs (ensure these point to correct datasets)
customers_url = 'https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE'

# Load dataset
customers = pd.read_csv(customers_url)

# Display the first few rows to understand the dataset
print("Customers DataFrame:\n", customers.head())

# Ensure 'CustomerID' exists before dropping
if 'CustomerID' in customers.columns:
    # Exclude non-numerical and irrelevant columns
    numerical_columns = customers.select_dtypes(include=['float64', 'int64']).columns.tolist()
    print("\nNumerical columns:", numerical_columns)

    if numerical_columns:
        # Scale only numerical columns
        scaler = StandardScaler()
        customer_features = scaler.fit_transform(customers[numerical_columns])

        # Step 4: Calculate the cosine similarity matrix
        similarity_matrix = cosine_similarity(customer_features)
        print("\nCosine Similarity Matrix:\n", similarity_matrix)
    else:
        print("No numerical columns found for scaling.")
else:
    print("'CustomerID' column is missing or incorrectly specified.")


Customers DataFrame:
   CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Numerical columns: []
No numerical columns found for scaling.


In [3]:
import requests
from io import StringIO
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Define function to download CSV from Google Drive
def download_csv_from_google_drive(url):
    try:
        file_id = url.split('/d/')[1].split('/')[0]
        download_url = f"https://drive.google.com/uc?id={file_id}"
        response = requests.get(download_url)
        response.raise_for_status()  # Check if the request was successful
        return StringIO(response.text)
    except Exception as e:
        print(f"Error downloading CSV: {e}")
        return None

# Step 2: Load Data from Google Drive URLs
customers_url = 'https://drive.google.com/file/d/1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE/view?usp=sharing'  # Example URL for Customers.csv
products_url = 'https://drive.google.com/file/d/1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0/view?usp=sharing'  # Example URL for Products.csv

customers_data = download_csv_from_google_drive(customers_url)
products_data = download_csv_from_google_drive(products_url)

if customers_data is None or products_data is None:
    print("Error loading the data.")
else:
    # Step 3: Load the CSV data into pandas DataFrames
    customers = pd.read_csv(customers_data)
    products = pd.read_csv(products_data)

    # Step 4: Check for missing values in customer data
    print("Checking for missing values in customer data...")
    print(customers.isna().sum())

    # Step 5: Feature Engineering - Exclude non-numeric columns
    # Identify columns that are numeric
    numeric_columns = customers.select_dtypes(include=['number']).columns

    # For categorical features, use one-hot encoding (if relevant)
    categorical_columns = customers.select_dtypes(exclude=['number']).columns

    # For categorical features, use one-hot encoding (if relevant)
    encoder = OneHotEncoder(sparse_output=False)  # Updated argument here
    encoded_categorical_data = encoder.fit_transform(customers[categorical_columns])

    # Combine the numeric features and the encoded categorical data
    numeric_data = customers[numeric_columns].values
    customer_features = pd.concat([pd.DataFrame(numeric_data), pd.DataFrame(encoded_categorical_data)], axis=1)

    # Step 6: Normalize the features
    scaler = StandardScaler()
    customer_features_scaled = scaler.fit_transform(customer_features)

    # Step 7: Calculate Similarity
    similarity_matrix = cosine_similarity(customer_features_scaled)

    # Step 8: Generate Lookalikes
    lookalikes = {}
    for i, cust_id in enumerate(customers['CustomerID'][:20]):  # First 20 customers
        similar_indices = similarity_matrix[i].argsort()[::-1][1:4]  # Top 3 similar excluding self
        similar_scores = similarity_matrix[i][similar_indices]
        lookalikes[cust_id] = [(customers['CustomerID'][j], round(score, 2)) for j, score in zip(similar_indices, similar_scores)]

    # Step 9: Debugging - Print the lookalikes dictionary
    print("Lookalikes generated:")
    print(lookalikes)

    # Step 10: Save Lookalike.csv
    try:
        lookalike_df = pd.DataFrame({
            'cust_id': lookalikes.keys(),
            'lookalikes': [str(v) for v in lookalikes.values()]
        })
        lookalike_df.to_csv('Lookalike.csv', index=False)
        print("Output saved as Lookalike.csv")
    except Exception as e:
        print(f"Error saving the file: {e}")


Checking for missing values in customer data...
CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
Lookalikes generated:
{'C0001': [('C0151', np.float64(0.0)), ('C0126', np.float64(0.0)), ('C0191', np.float64(0.0))], 'C0002': [('C0166', np.float64(0.19)), ('C0175', np.float64(0.0)), ('C0115', np.float64(0.0))], 'C0003': [('C0151', np.float64(0.0)), ('C0126', np.float64(0.0)), ('C0191', np.float64(0.0))], 'C0004': [('C0191', np.float64(0.0)), ('C0151', np.float64(0.0)), ('C0126', np.float64(0.0))], 'C0005': [('C0115', np.float64(0.0)), ('C0175', np.float64(0.0)), ('C0002', np.float64(0.0))], 'C0006': [('C0151', np.float64(0.0)), ('C0126', np.float64(0.0)), ('C0191', np.float64(0.0))], 'C0007': [('C0053', np.float64(0.19)), ('C0115', np.float64(0.0)), ('C0175', np.float64(0.0))], 'C0008': [('C0055', np.float64(0.0)), ('C0038', np.float64(0.0)), ('C0029', np.float64(0.0))], 'C0009': [('C0105', np.float64(0.0)), ('C0111', np.float64(0.0)), ('C0063', np.flo

In [4]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5)
customer_clusters = kmeans.fit_predict(customer_features_scaled)
customers['Cluster'] = customer_clusters
print(customers[['CustomerID', 'Cluster']].head())
print(customers['Cluster'].value_counts())


  CustomerID  Cluster
0      C0001        3
1      C0002        3
2      C0003        3
3      C0004        3
4      C0005        3
Cluster
3    195
2      2
4      1
1      1
0      1
Name: count, dtype: int64
