In [2]:
!pip install scikit-fuzzy

Collecting scikit-fuzzy
  Downloading scikit-fuzzy-0.4.2.tar.gz (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m994.0/994.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-fuzzy
  Building wheel for scikit-fuzzy (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-fuzzy: filename=scikit_fuzzy-0.4.2-py3-none-any.whl size=894078 sha256=82b2e9c12dda7c83f98a15f1fd65ea0f1e1d95a6e842d8470ccc238ba6794bc0
  Stored in directory: /root/.cache/pip/wheels/4f/86/1b/dfd97134a2c8313e519bcebd95d3fedc7be7944db022094bc8
Successfully built scikit-fuzzy
Installing collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.4.2


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import skfuzzy as fuzz


In [54]:
data = pd.read_csv('HousingData.csv')
print(data.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   

        B  LSTAT  MEDV  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90    NaN  36.2  


In [55]:
data = data.drop('CHAS', axis=1) #dropping because categorical

In [56]:
#Taking log transformations
data['MEDV'] = np.log(data['MEDV'] + 1)
data['CRIM'] = np.log(data['CRIM'] + 1)
data['NOX'] = np.log(data['NOX'] + 1)
data['DIS'] = np.log(data['DIS'] + 1)
data['TAX'] = np.log(data['TAX'] + 1)

In [57]:
#Normalising values

scaler = MinMaxScaler()
data[['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']] = scaler.fit_transform(data[['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']])

In [90]:
test = data.sample(n=50)  #test data
data = data.drop(test.index)

In [59]:
def extract_fuzzy_rules_with_values(data ,category_value=3):
    #define the fuzzy membership functions
    if category_value == 3:
        categories = {
            "Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0, 0, 0.3]),
            "Medium": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.15, 0.3, 0.45]),
            "High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.3, 0.45, 1])
        }
    elif category_value == 5:
        categories = {
            "Very Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0, 0, 0.2]),
            "Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.1, 0.2, 0.4]),
            "Medium": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.3, 0.4, 0.6]),
            "High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.5, 0.6, 0.8]),
            "Very High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.7, 0.8, 1])
    }
    elif category_value == 7:
         categories = {
        "Very Very Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0, 0, 0.143]),
        "Very Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.071, 0.143, 0.286]),
        "Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.214, 0.286, 0.429]),
        "Medium": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.357, 0.429, 0.571]),
        "High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.5, 0.571, 0.714]),
        "Very High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.643, 0.714, 0.857]),
        "Very Very High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.786, 0.857, 1])
    }


    #creating a dataframe for rules
    rules_df = pd.DataFrame(columns=list(data.columns) + ['Rule Value'])

    for index, row in data.iterrows():
        rule = []
        rule_value = 1
        for col in data.columns:
            #taken from skfuzzy docs
            memberships = {cat: fuzz.interp_membership(np.arange(0, 1.01, 0.01), mf, row[col]) for cat, mf in categories.items()}
            highest_category = max(memberships, key=memberships.get)
            rule.append(highest_category)
            rule_value *= memberships[highest_category] #assigning value to each rule

        rule.append(rule_value)
        rules_df.loc[index] = rule

    return rules_df

rules_df_three = extract_fuzzy_rules_with_values(data, category_value=3)
rules_df_five = extract_fuzzy_rules_with_values(data, category_value=5)
rules_df_seven = extract_fuzzy_rules_with_values(data, category_value=7)
print(rules_df_five.head())

       CRIM        ZN     INDUS     NOX    RM        AGE     DIS       RAD  \
0  Very Low       Low  Very Low  Medium  High       High  Medium  Very Low   
1  Very Low  Very Low       Low     Low  High  Very High    High  Very Low   
2  Very Low  Very Low       Low     Low  High       High    High  Very Low   
3  Very Low  Very Low  Very Low     Low  High     Medium    High  Very Low   
4  Very Low  Very Low  Very Low     Low  High     Medium    High  Very Low   

        TAX PTRATIO          B     LSTAT       MEDV    Rule Value  
0    Medium     Low  Very High  Very Low       High  4.675572e-18  
1       Low    High  Very High       Low       High  3.632230e-17  
2       Low    High  Very High  Very Low  Very High  2.901690e-03  
3  Very Low    High  Very High  Very Low  Very High  6.234237e-04  
4  Very Low    High  Very High  Very Low  Very High           NaN  


In [60]:
def clean_rules(df):
    df = df.dropna(subset=['Rule Value']) #we have some NaN values

    input_features = df.columns[:-2] #taking all features except the MEDV feature and the rule value
    groups = df.groupby(list(input_features)) #grouping by input feature fuzzy sets
    best_rules = pd.DataFrame()

    for _, group in groups:
        if len(group['MEDV'].unique()) > 1:  #if the input assignments lead to more than one outputs (consequents)
            max_rule = group.loc[group['Rule Value'].idxmax()] #take the rule with the highets rule value
            best_rules = pd.concat([best_rules, pd.DataFrame([max_rule])], ignore_index=True)
        else:
            best_rules = pd.concat([best_rules, group], ignore_index=True)#if thats the only rule

    return best_rules

best_rules_three = clean_rules(rules_df_three)
best_rules_five = clean_rules(rules_df_five)
best_rules_seven = clean_rules(rules_df_seven)

In [87]:
#Confession -> this function was changed after submission at 1:50AM in an attempt to fix the defuzzification process. However, I was unsuccessful in fixing it.
#Wrong implementation
def make_predictions(data, rules_df, category_value=3):
    if category_value == 3:
        medv_categories = {
            "Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0, 0, 0.3]),
            "Medium": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.15, 0.3, 0.45]),
            "High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.3, 0.45, 1])
        }
    elif category_value == 5:
        medv_categories = {
            "Very Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0, 0, 0.2]),
            "Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.1, 0.2, 0.4]),
            "Medium": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.3, 0.4, 0.6]),
            "High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.5, 0.6, 0.8]),
            "Very High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.7, 0.8, 1])
    }
    elif category_value == 7:
        medv_categories = {
        "Very Very Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0, 0, 0.143]),
        "Very Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.071, 0.143, 0.286]),
        "Low": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.214, 0.286, 0.429]),
        "Medium": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.357, 0.429, 0.571]),
        "High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.5, 0.571, 0.714]),
        "Very High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.643, 0.714, 0.857]),
        "Very Very High": fuzz.trimf(np.arange(0, 1.01, 0.01), [0.786, 0.857, 1])
    }

    #taken from skfuzzy docs
    medv_centroids = {
        cat: fuzz.centroid(np.arange(0, 1.01, 0.01), mf) for cat, mf in medv_categories.items()
    }

    predictions = []
    i = 0
    for _, row in data.iterrows():
        i = i+1
        print(f"Processing data: {i}")
        best_match = None
        highest_membership = 0

        for _, rule_row in rules_df.iterrows():
            product_of_memberships = 1
            for col in data.columns[:-1]:  # ignore MEDV and rule value
                cat = rule_row[col] #get the value for feature
                mf = medv_categories[cat]
                membership = fuzz.interp_membership(np.arange(0, 1.01, 0.01), mf, row[col])#get membership value
                product_of_memberships *= membership#because of 'and' operation

            if product_of_memberships > highest_membership:
                highest_membership = product_of_memberships
                best_match = rule_row['MEDV']

        if best_match:
            predicted_medv = medv_centroids[best_match]#use the centroid defuzzification
            predictions.append(predicted_medv)
        else:
            predictions.append(0)

    return predictions

In [88]:
pred=make_predictions(test, rules_df_three)

Processing data: 1
Processing data: 2
Processing data: 3
Processing data: 4
Processing data: 5
Processing data: 6
Processing data: 7
Processing data: 8
Processing data: 9
Processing data: 10
Processing data: 11
Processing data: 12
Processing data: 13
Processing data: 14
Processing data: 15
Processing data: 16
Processing data: 17
Processing data: 18
Processing data: 19
Processing data: 20
Processing data: 21
Processing data: 22
Processing data: 23
Processing data: 24
Processing data: 25
Processing data: 26
Processing data: 27
Processing data: 28
Processing data: 29
Processing data: 30
Processing data: 31
Processing data: 32
Processing data: 33
Processing data: 34
Processing data: 35
Processing data: 36
Processing data: 37
Processing data: 38
Processing data: 39
Processing data: 40
Processing data: 41
Processing data: 42
Processing data: 43
Processing data: 44
Processing data: 45
Processing data: 46
Processing data: 47
Processing data: 48
Processing data: 49
Processing data: 50
Processin

In [89]:
#calculating r2 score
#r2 scores are low because of wrong defuzzification method
from sklearn.metrics import r2_score
medv_predictions = pred
actual_values = test['MEDV'].tolist()
r2 = r2_score(actual_values, medv_predictions)
print("R^2 Score:", r2)

R^2 Score: -16.28132613553719
