In [4]:
import pandas as pd
import numpy as np


np.random.seed(42)
n_samples = 100000



ages = np.random.randint(18, 60, size=n_samples)
genders = np.random.choice(['Female', 'Male'], size=n_samples)
hydration_levels = np.random.choice(['Low', 'Medium', 'High'], size=n_samples)
oil_levels = np.random.choice(['Low', 'Medium', 'High'], size=n_samples)
sensitivity_levels = np.random.choice(['Low', 'Medium', 'High'], size=n_samples)
humidity_levels = np.random.uniform(20, 80, size=n_samples).round(1)
temperature_levels = np.random.uniform(10, 35, size=n_samples).round(1)


data = {
    'Age': ages,
    'Gender': genders,
    'Hydration_Level': hydration_levels,
    'Oil_Level': oil_levels,
    'Sensitivity': sensitivity_levels,
    'Humidity': humidity_levels,
    'Temperature': temperature_levels
}

df = pd.DataFrame(data)


def determine_skin_type_refined(row):
    if row['Hydration_Level'] in ['Medium', 'High'] and \
       row['Oil_Level'] == 'Medium' and \
       row['Sensitivity'] in ['Low', 'Medium'] and \
       35 <= row['Humidity'] <= 65 and \
       15 <= row['Temperature'] <= 25:
        return 'Normal'
    
    if row['Hydration_Level'] == 'Low' and \
       row['Oil_Level'] == 'Low' and \
       row['Sensitivity'] == 'High' and \
       20 <= row['Humidity'] <= 45 and \
       10 <= row['Temperature'] <= 20:
        return 'Dry'
    
    if row['Hydration_Level'] in ['Low', 'Medium'] and \
       row['Oil_Level'] == 'High' and \
       row['Sensitivity'] in ['Low', 'Medium'] and \
       45 <= row['Humidity'] <= 80 and \
       20 <= row['Temperature'] <= 30:
        return 'Oily'
    
    return 'Combination'


df['Skin_Type'] = df.apply(determine_skin_type_refined, axis=1)


skin_type_counts = df['Skin_Type'].value_counts()
print(skin_type_counts)

Skin_Type
Combination    92783
Oily            3568
Normal          3069
Dry              580
Name: count, dtype: int64


In [5]:
Combination = df[df["Skin_Type"]=="Combination"]
Oily = df[df["Skin_Type"]=="Oily"]
Normal = df[df["Skin_Type"]=="Normal"]
Dry = df[df["Skin_Type"]=="Dry"]

In [8]:
Combination = Combination.head(500)
Oily = Oily.head(500)
Normal = Normal.head(500)
Dry = Dry.head(500)

In [10]:
df = pd.concat([Combination,Oily,Normal,Dry],axis=0,ignore_index=True)

In [12]:
df.describe

<bound method NDFrame.describe of       Age  Gender Hydration_Level Oil_Level Sensitivity  Humidity  \
0      56  Female          Medium      High      Medium      76.3   
1      46    Male          Medium      High        High      41.7   
2      32  Female          Medium      High        High      50.0   
3      25  Female          Medium    Medium        High      53.4   
4      38    Male            High       Low         Low      72.4   
...   ...     ...             ...       ...         ...       ...   
1995   32  Female             Low       Low        High      44.8   
1996   19  Female             Low       Low        High      29.4   
1997   38  Female             Low       Low        High      36.9   
1998   46    Male             Low       Low        High      40.4   
1999   38  Female             Low       Low        High      34.3   

      Temperature    Skin_Type  
0            32.5  Combination  
1            27.8  Combination  
2            23.9  Combination  
3    

In [14]:
df = df.sample(frac=1,random_state=42)

In [16]:
df

Unnamed: 0,Age,Gender,Hydration_Level,Oil_Level,Sensitivity,Humidity,Temperature,Skin_Type
1860,36,Male,Low,Low,High,31.9,10.1,Dry
353,36,Female,High,Low,High,30.1,20.5,Combination
1333,34,Female,High,Medium,Low,53.0,19.5,Normal
905,51,Male,Medium,High,Low,57.3,22.7,Oily
1289,36,Male,High,Medium,Medium,64.4,24.0,Normal
...,...,...,...,...,...,...,...,...
1130,27,Male,Medium,Medium,Medium,63.4,17.3,Normal
1294,59,Male,Medium,Medium,Low,55.9,16.0,Normal
860,35,Female,Medium,High,Low,49.6,26.8,Oily
1459,36,Male,Medium,Medium,Medium,51.0,23.5,Normal


In [18]:
df.to_csv("skin_type_final.csv", index=False)

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Hydration_Level'] = label_encoder.fit_transform(df['Hydration_Level'])
df['Oil_Level'] = label_encoder.fit_transform(df['Oil_Level'])
df['Sensitivity'] = label_encoder.fit_transform(df['Sensitivity'])
df['Skin_Type'] = label_encoder.fit_transform(df['Skin_Type'])


X = df.drop('Skin_Type', axis=1)
y = df['Skin_Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=12)


rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)


print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Confusion Matrix:
[[193   0  10   1]
 [  0 205   0   0]
 [  0   0 193   0]
 [  0   0   0 198]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       204
           1       1.00      1.00      1.00       205
           2       0.95      1.00      0.97       193
           3       0.99      1.00      1.00       198

    accuracy                           0.99       800
   macro avg       0.99      0.99      0.99       800
weighted avg       0.99      0.99      0.99       800

Accuracy Score: 0.98625


In [21]:
from xgboost import XGBClassifier

model = XGBClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ModuleNotFoundError: No module named 'xgboost'