<a href="https://colab.research.google.com/github/shuler7/DataMiningPublic/blob/main/Attribute_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Opening the Data

In [97]:
import numpy as np
import pandas as pd
import math

df = pd.read_csv("https://github.com/shuler7/DataMiningPublic/raw/main/spaceship_titanic_train.csv")
df_test= pd.read_csv("https://github.com/shuler7/DataMiningPublic/raw/main/spaceship_titanic_test.csv")

df.dropna(inplace=True)

In [98]:
df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [31]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [124]:
# Features
X = df[['HomePlanet', 'CryoSleep', 'Destination', 'Age',  'VIP', 'RoomService',
        'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']]

# Target
y = df['Transported']

# Remove the variables that are not being used as features
df = pd.concat( [X,y], axis=1 )
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


# Function for attribute selection for categorical variables

In [127]:
def Attribute_Selection( df, target):
  features = df.columns[ df.columns != "Transported" ]
  entropies = []
  overall = len( df )

  for attribute in features:
    # Calculate gini of attribute
    entropy = 0
    vals = df[ attribute].unique()

    if len(vals) <=10:
      # The attribute is categorical
      for val in vals:
        subset_size = len(df[ df[attribute] == val ])
        weight = subset_size / overall
        props = df[ df[attribute] == val ][target].value_counts( normalize=True )
        for p in props.array:
          entropy =  entropy - weight*(p*math.log2(p))

      entropies.append( entropy )
    else:
      #The attribute is quantitative
      tempEntropies = []
      vals.sort()
      for val in vals:
        left = df[ df[attribute] <= val ][ [attribute,target] ]
        props = left[ target ].value_counts( normalize = True )
        weight = len( left ) / overall
        for prop in props.array:
          entropy = entropy - weight*prop*math.log2( prop )
        right = df[ df[attribute] > val ][ [attribute,target] ]
        props = right[ target ].value_counts( normalize = True )
        weight = len( right ) / overall
        for prop in props.array:
          entropy = entropy - weight*prop*math.log2( prop )
        tempEntropies.append(entropy)
      entropies.append(tempEntropies[pd.Series( tempEntropies ).idxmin()])

  # Turns into a dataframe
  ginis = pd.DataFrame( {
    'Feature': features.to_list(),
    'Gini': entropies
  })
  return ginis



In [128]:
Attribute_Selection(df, 'Transported')

Unnamed: 0,Feature,Gini
0,HomePlanet,0.972205
1,CryoSleep,0.836178
2,Destination,0.988452
3,Age,0.992782
4,VIP,0.99866
5,RoomService,0.911198
6,FoodCourt,0.963078
7,ShoppingMall,0.948358
8,Spa,0.91056
9,VRDeck,0.92082
