In [2]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [3]:
# load data
df_test = pd.read_csv('Doceree-HCP-Test.csv')

In [4]:
df_test.isnull().sum()

PK                    0
DEVICETYPE            6
PLATFORM_ID          23
BIDREQUESTIP         23
USERPLATFORMUID      23
USERCITY           1617
USERZIPCODE        1173
USERAGENT            24
PLATFORMTYPE         23
CHANNELTYPE          23
URL                  23
KEYWORDS            129
dtype: int64

In [5]:
#impute missing values of numeric columns with mean and categorical columns with mode

cat_cols = df_test.select_dtypes(include='object').columns

for col in cat_cols:
    df_test[col] = df_test[col].fillna(df_test[col].mode()[0])

num_cols = df_test.select_dtypes(include=['int64','float64']).columns

for col in num_cols:
    df_test[col] = df_test[col].fillna(df_test[col].mean())


In [6]:
df_test.drop(['USERZIPCODE'], axis=1, inplace=True)
df_test.drop('CHANNELTYPE', axis=1, inplace=True)

In [7]:
#feature engineering on keyword column to extract keywords
# tokenize the keywords column where words are separated by '|'
df_test['KEYWORDS'] = df_test['KEYWORDS'].str.split('|')

# Create a set to store all keywords
all_keywords = set()

# Extract unique keywords and add them to the set
for keywords in df_test['KEYWORDS']:
    all_keywords.update(keywords)

# Create a column for each keyword and assign values
for keyword in all_keywords:
    df_test[keyword] = df_test['KEYWORDS'].apply(lambda x: int(keyword in x))

# Drop the 'KEYWORDS' column
df_test.drop(['KEYWORDS'], axis=1, inplace=True)

  df_test[keyword] = df_test['KEYWORDS'].apply(lambda x: int(keyword in x))


In [8]:
#encode categorical columns using label encoding

le = LabelEncoder()

cat_cols = df_test.select_dtypes(include='object').columns

for col in cat_cols:
    df_test[col] = le.fit_transform(df_test[col])

df_test.dtypes

PK                                 int32
DEVICETYPE                         int32
PLATFORM_ID                      float64
BIDREQUESTIP                       int32
USERPLATFORMUID                    int32
                                  ...   
Aortic Aneurysm                    int64
Anticoagulants                     int64
Amyotrophic Lateral Sclerosis      int64
chronic                            int64
small                              int64
Length: 1437, dtype: object

In [9]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [10]:

# Assuming 'X' is your feature matrix

# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_test)


In [11]:

# Perform PCA
pca = PCA(n_components=3)  # Specify the number of components to keep
X_pca = pca.fit_transform(X_scaled)



In [12]:


# Explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)

# Get the selected features based on the PCA transformation
selected_features = pca.components_
print("Selected Features:", selected_features)


Explained Variance Ratio: [0.04038412 0.02939575 0.02481501]
Selected Features: [[-4.70290510e-04 -1.67109820e-02  3.23381635e-02 ... -7.76923537e-03
   4.46612795e-02  7.51995396e-02]
 [ 1.55105391e-04  5.09998736e-04  1.00037290e-02 ... -2.29093632e-03
   4.05354560e-02  6.01607022e-02]
 [ 1.29970659e-03  6.98663323e-05 -8.61344695e-03 ...  1.73691957e-03
  -7.70125170e-03 -2.44741531e-02]]


In [8]:
#save the preprocessed test data
df = pd.DataFrame(data = X_pca, columns = ['PC1', 'PC2','PC3'])
df.to_csv('preprocessed_test.csv', index=False)

In [9]:
df_test.shape

(28553, 1437)