### IMPORITNG LIBRARIES


In [448]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [449]:
df = pd.read_csv('Consumer_Dataset.csv')
df = df.drop(columns='Unnamed: 0')
df

Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable,Group
0,Male,22,No,4.0,Healthcare,No,1.0,Low,Hydro,D
1,Female,38,Yes,3.0,Engineer,Yes,,Average,Hydro,A
2,Female,67,Yes,1.0,Engineer,Yes,1.0,Low,Solar,B
3,Male,67,Yes,2.0,Lawyer,Yes,0.0,High,Solar,B
4,Female,40,Yes,6.0,Entertainment,Yes,,High,Solar,A
...,...,...,...,...,...,...,...,...,...,...
8063,Male,22,No,7.0,,No,0.0,Low,Tidal,D
8064,Male,35,No,4.0,Executive,No,3.0,Low,Hydro,D
8065,Female,33,No,1.0,Healthcare,Yes,1.0,Low,Solar,D
8066,Female,27,No,4.0,Healthcare,Yes,1.0,Low,Solar,B


In [450]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Gender               8068 non-null   object 
 1   Age                  8068 non-null   int64  
 2   Ever_Married         7928 non-null   object 
 3   Family_Size          7733 non-null   float64
 4   Profession           7944 non-null   object 
 5   Graduated            7990 non-null   object 
 6   Work_Experience      7239 non-null   float64
 7   Energy_Consumption   8068 non-null   object 
 8   Preferred_Renewable  7992 non-null   object 
 9   Group                8068 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 630.4+ KB


### TAKING CARE OF BLANK CELL


In [451]:
df = df.dropna(subset=['Ever_Married'])
df = df.dropna(subset=['Profession'])
df = df.dropna(subset=['Graduated'])
df = df.dropna(subset=['Preferred_Renewable'])


imputer = SimpleImputer(strategy='median')
df['Family_Size'] = imputer.fit_transform(df[['Family_Size']])
df['Work_Experience'] = imputer.fit_transform(df[['Work_Experience']])
df

Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable,Group
0,Male,22,No,4.0,Healthcare,No,1.0,Low,Hydro,D
1,Female,38,Yes,3.0,Engineer,Yes,1.0,Average,Hydro,A
2,Female,67,Yes,1.0,Engineer,Yes,1.0,Low,Solar,B
3,Male,67,Yes,2.0,Lawyer,Yes,0.0,High,Solar,B
4,Female,40,Yes,6.0,Entertainment,Yes,1.0,High,Solar,A
...,...,...,...,...,...,...,...,...,...,...
8062,Male,41,Yes,5.0,Artist,Yes,0.0,High,Solar,B
8064,Male,35,No,4.0,Executive,No,3.0,Low,Hydro,D
8065,Female,33,No,1.0,Healthcare,Yes,1.0,Low,Solar,D
8066,Female,27,No,4.0,Healthcare,Yes,1.0,Low,Solar,B


In [452]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7669 entries, 0 to 8067
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Gender               7669 non-null   object 
 1   Age                  7669 non-null   int64  
 2   Ever_Married         7669 non-null   object 
 3   Family_Size          7669 non-null   float64
 4   Profession           7669 non-null   object 
 5   Graduated            7669 non-null   object 
 6   Work_Experience      7669 non-null   float64
 7   Energy_Consumption   7669 non-null   object 
 8   Preferred_Renewable  7669 non-null   object 
 9   Group                7669 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 659.1+ KB


### ENCODING


In [453]:
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Ever_Married'] = label_encoder.fit_transform(df['Ever_Married'])
df['Graduated'] = label_encoder.fit_transform(df['Graduated'])
df['Energy_Consumption'] = label_encoder.fit_transform(
    df['Energy_Consumption'])


label_features = ['Gender', 'Ever_Married', 'Graduated', 'Age',
                  'Family_Size', 'Work_Experience', 'Energy_Consumption']
onehot_features = ['Profession', 'Preferred_Renewable']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), label_features),
        ('cat', OneHotEncoder(), onehot_features)
    ])

df_new = preprocessor.fit_transform(df)

df_new

array([[ 0.90476042, -1.18908053, -1.29816181, ...,  0.        ,
         0.        ,  0.        ],
       [-1.10526497,  0.84098594,  0.77031999, ...,  0.        ,
         0.        ,  0.        ],
       [-1.10526497,  0.84098594,  0.77031999, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-1.10526497, -1.18908053,  0.77031999, ...,  1.        ,
         0.        ,  0.        ],
       [-1.10526497, -1.18908053,  0.77031999, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.90476042,  0.84098594,  0.77031999, ...,  0.        ,
         0.        ,  0.        ]])

In [454]:
num_features_names = label_features
cat_features_names = preprocessor.named_transformers_[
    'cat'].get_feature_names_out(onehot_features)
all_feature_names = np.concatenate((num_features_names, cat_features_names))


df_new = pd.DataFrame(df_new, columns=all_feature_names)
df_new

Unnamed: 0,Gender,Ever_Married,Graduated,Age,Family_Size,Work_Experience,Energy_Consumption,Profession_Artist,Profession_Doctor,Profession_Engineer,...,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Preferred_Renewable_Biomass,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,Preferred_Renewable_Solar,Preferred_Renewable_Tidal,Preferred_Renewable_Wind
0,0.904760,-1.189081,-1.298162,-1.288844,0.765184,-0.450313,0.752916,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-1.105265,0.840986,0.770320,-0.330085,0.097964,-0.450313,-1.605310,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-1.105265,0.840986,0.770320,1.407665,-1.236474,-0.450313,0.752916,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.904760,0.840986,0.770320,1.407665,-0.569255,-0.755982,-0.426197,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.105265,0.840986,0.770320,-0.210240,2.099622,-0.450313,-0.426197,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7664,0.904760,0.840986,0.770320,-0.150318,1.432403,-0.755982,-0.426197,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7665,0.904760,-1.189081,-1.298162,-0.509852,0.765184,0.161025,0.752916,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7666,-1.105265,-1.189081,0.770320,-0.629697,-1.236474,-0.450313,0.752916,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7667,-1.105265,-1.189081,0.770320,-0.989232,0.765184,-0.450313,0.752916,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [455]:
X = df_new
y = pd.get_dummies(df['Group']).values.astype(float)
y

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [456]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5)

model = Pipeline(
    steps=[('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)

[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 1. 0.]]


In [457]:
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

In [458]:
print("Accuracy:", accuracy_score(y_pred, y_test))

Accuracy: 0.3983050847457627


### TEST DATASET


In [459]:
df_test = pd.read_csv('Consumer Test Dataset.csv')

df_test = df_test.dropna(subset=['Ever_Married'])
df_test = df_test.dropna(subset=['Profession'])
df_test = df_test.dropna(subset=['Graduated'])
df_test = df_test.dropna(subset=['Preferred_Renewable'])


imputer = SimpleImputer(strategy='median')
df_test['Family_Size'] = imputer.fit_transform(df_test[['Family_Size']])
df_test['Work_Experience'] = imputer.fit_transform(
    df_test[['Work_Experience']])


label_encoder = LabelEncoder()
df_test['Gender'] = label_encoder.fit_transform(df_test['Gender'])
df_test['Ever_Married'] = label_encoder.fit_transform(df_test['Ever_Married'])
df_test['Graduated'] = label_encoder.fit_transform(df_test['Graduated'])
df_test['Energy_Consumption'] = label_encoder.fit_transform(
    df_test['Energy_Consumption'])


label_features_new = ['Gender', 'Ever_Married', 'Graduated',
                      'Age', 'Family_Size', 'Work_Experience', 'Energy_Consumption']
onehot_features_new = ['Profession', 'Preferred_Renewable']


preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), label_features_new), ('cat', OneHotEncoder(), onehot_features_new)])

df_test_new = preprocessor.fit_transform(df_test)


num_features_names = label_features_new
cat_features_names = preprocessor.named_transformers_[
    'cat'].get_feature_names_out(onehot_features_new)
all_feature_names = np.concatenate((num_features_names, cat_features_names))


df_test_new = pd.DataFrame(df_test_new, columns=all_feature_names)

y_pred = model.predict(df_test_new)
y_pred_labels = np.argmax(y_pred, axis=1)
df_test['Group'] = y_pred_labels
df_test

Unnamed: 0.1,Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable,Group
0,0,0,36,1,1.0,Engineer,1,0.0,2,Solar,0
1,1,1,37,1,4.0,Healthcare,1,8.0,0,Solar,0
3,3,1,59,1,2.0,Executive,0,11.0,1,Solar,2
4,4,0,19,0,4.0,Marketing,0,1.0,2,Solar,3
5,5,1,47,1,5.0,Doctor,1,0.0,1,Hydro,2
...,...,...,...,...,...,...,...,...,...,...,...
2622,2622,1,29,0,4.0,Healthcare,0,9.0,2,Solar,3
2623,2623,0,35,0,1.0,Doctor,1,1.0,2,Solar,1
2624,2624,0,53,0,2.0,Entertainment,1,1.0,2,Solar,3
2625,2625,1,47,1,5.0,Executive,1,1.0,1,Hydro,2


In [460]:
df_test['Group'] = df_test['Group'].replace({0: 'A', 1: 'B', 2: 'C', 3: 'D'})
df_test

Unnamed: 0.1,Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable,Group
0,0,0,36,1,1.0,Engineer,1,0.0,2,Solar,A
1,1,1,37,1,4.0,Healthcare,1,8.0,0,Solar,A
3,3,1,59,1,2.0,Executive,0,11.0,1,Solar,C
4,4,0,19,0,4.0,Marketing,0,1.0,2,Solar,D
5,5,1,47,1,5.0,Doctor,1,0.0,1,Hydro,C
...,...,...,...,...,...,...,...,...,...,...,...
2622,2622,1,29,0,4.0,Healthcare,0,9.0,2,Solar,D
2623,2623,0,35,0,1.0,Doctor,1,1.0,2,Solar,B
2624,2624,0,53,0,2.0,Entertainment,1,1.0,2,Solar,D
2625,2625,1,47,1,5.0,Executive,1,1.0,1,Hydro,C


In [None]:
df_test.to_csv('Modified_Consumer_Test_Data.csv', index=False)