In [21]:
import pandas as pd
import numpy as np
from sklearn import utils

In [23]:
from sklearn.linear_model import LogisticRegression

In [5]:
df = pd.read_csv('balance-scale.data')

In [6]:
df.head()

Unnamed: 0,B,1,1.1,1.2,1.3
0,R,1,1,1,2
1,R,1,1,1,3
2,R,1,1,1,4
3,R,1,1,1,5
4,R,1,1,2,1


In [7]:
df.columns = ['class_name', 'left_weight', 'left_dist', 'right_weight', 'right_dist']

In [8]:
df.head()

Unnamed: 0,class_name,left_weight,left_dist,right_weight,right_dist
0,R,1,1,1,2
1,R,1,1,1,3
2,R,1,1,1,4
3,R,1,1,1,5
4,R,1,1,2,1


In [10]:
df['class_name'].value_counts(100)

L    0.461538
R    0.461538
B    0.076923
Name: class_name, dtype: float64

In [38]:
df['class_name'].value_counts()

L    288
R    288
B     48
Name: class_name, dtype: int64

In [12]:
y = df['class_name']
X = df.drop('class_name', axis=1)

In [13]:
X.head()

Unnamed: 0,left_weight,left_dist,right_weight,right_dist
0,1,1,1,2
1,1,1,1,3
2,1,1,1,4
3,1,1,1,5
4,1,1,2,1


In [14]:
y.head()

0    R
1    R
2    R
3    R
4    R
Name: class_name, dtype: object

In [28]:
model = LogisticRegression().fit(X, y)
model.score(X, y)



0.8782051282051282

In [30]:
np.unique(model.predict(X))
# as we can see model is predicting only Left or Right but not "Balanced Class"
# This is the reason we are geeting high score becuase model never predicts the Blances

array(['L', 'R'], dtype=object)

### Dealing with Imbalance


#### 1st Method: Upsampling

In [41]:
df_majority = df[df['class_name'].isin(['L','R'])]
df_minority = df[df['class_name'] == 'B']
df_min_upsample = utils.resample(df_minority, replace=True, n_samples=288, random_state=42) #majority class size is 288
df_balanced = pd.concat([df_min_upsample, df_majority])

In [42]:
df_balanced['class_name'].value_counts()

L    288
B    288
R    288
Name: class_name, dtype: int64

In [43]:
y = df_balanced['class_name']
X = df_balanced.drop('class_name', axis=1)

In [44]:
model = LogisticRegression()
model.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [46]:
model.score(X, y)

0.9236111111111112

In [48]:
np.unique(model.predict(X))
# The model is predicting all 3 classes now

array(['B', 'L', 'R'], dtype=object)

#### 2nd Method: Class Weight

In [49]:
from sklearn.svm import SVC

In [50]:
y = df['class_name']
X = df.drop('class_name', axis=1)

In [52]:
model_svc = SVC(kernel='linear', class_weight='balanced', probability=True)

In [53]:
model_svc.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [56]:
model_svc.score(X, y)
# Very close to upsampled linear model

0.9166666666666666

In [58]:
np.unique(model.predict(X))
# We can see, the model predicts 3 classes, too

array(['B', 'L', 'R'], dtype=object)