# Logistic Regression

LogisticRegression is a linear model used to model the probability of certain events occurring, for example the probability that an event succeeds or fails.

cuML’s LogisticRegression accepts array-like objects, either on the host (as NumPy arrays) or on the device (as Numba or `__cuda_array_interface__`-compliant arrays), in addition to cuDF objects. It provides both binary (using sigmoid loss) and multi-class (using softmax loss) variants, depending on the input variables.

# Setup
This file was tested using a RAPIDS 0.15 nightly build on a Titan RTX GPU (note: the `nvidia-smi` output captured below is from a Tesla T4).

In [1]:
# Show the GPU, driver version and CUDA version visible to this kernel.
!nvidia-smi

Sat Feb  6 00:34:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   37C    P0    26W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Report the CUDA compiler version; this only works when nvcc is on PATH.
!nvcc --version

/bin/bash: nvcc: command not found


# Imports

In [3]:
import pandas as pd
import numpy as np
import cudf

# USA employment data has been used for this demo

Data Loading from Amazon S3 bucket

In [4]:
# Location of the CSV used throughout this demo; read straight into a cuDF
# DataFrame.
DATA_URL = "https://rapids-keerthi.s3-us-west-1.amazonaws.com/lr.csv"
df = cudf.read_csv(DATA_URL)

Print the shape and the first 5 rows

In [5]:
# Print the (rows, columns) shape followed by a plain-text view of the first
# five rows.
print('data',df.shape)
print(df.head())

data (21721922, 45)
   YEAR  DATANUM  SERIAL CBSERIAL  HHWT  CPI99  GQ  QGQ  PERNUM  PERWT  ...  \
0  1970        2       1     <NA>   100   4.54   1  0.0       1    100  ...   
1  1970        2       1     <NA>   100   4.54   1  0.0       2    100  ...   
2  1970        2       2     <NA>   100   4.54   1  0.0       1    100  ...   
3  1970        2       2     <NA>   100   4.54   1  0.0       2    100  ...   
4  1970        2       4     <NA>   100   4.54   1  0.0       1    100  ...   

   EDUCD_POP EDUCD_SP  EDUCD_MOM2 EDUCD_POP2 INCTOT_HEAD  INCTOT_MOM  \
0       <NA>     30.0        <NA>       <NA>     12450.0        <NA>   
1       <NA>     60.0        <NA>       <NA>     12450.0        <NA>   
2       <NA>     60.0        <NA>       <NA>      9050.0        <NA>   
3       <NA>     70.0        <NA>       <NA>      9050.0        <NA>   
4       <NA>     23.0        <NA>       <NA>      7450.0        <NA>   

  INCTOT_POP INCTOT_SP  INCTOT_MOM2 INCTOT_POP2  
0       <NA>    3450.0

# How many people earned at least 1 million $ in the year 2010

In [6]:
# Keep the rows from YEAR 2010 whose INCTOT is at least 1,000,000.
# NOTE(review): INCTOT values of 9999999 show up later in this notebook for
# very young ages, so this selection may include a placeholder code rather
# than real incomes - confirm against the data dictionary.
tdf = df.query('INCTOT >= 1000000 and YEAR==2010')

In [7]:
# Number of non-null INCTOT entries in the filtered frame.
matching_rows = tdf['INCTOT'].count()
print("Total number of people = ", matching_rows)

Total number of people =  566893


# Big Picture

In [8]:
# Average INCTOT for each YEAR present in the data.
mean_inctot_by_year = df.groupby('YEAR')['INCTOT'].mean()
print(mean_inctot_by_year)

YEAR
1970    2.677882e+06
1980    2.296695e+06
1990    2.324673e+06
2000    2.188474e+06
2010    1.879017e+06
Name: INCTOT, dtype: float64


In [9]:
# Shape of the full frame, then a rich preview of the first five rows
# (the last expression of a cell is what gets displayed).
print(df.shape)
df.head()

(21721922, 45)


Unnamed: 0,YEAR,DATANUM,SERIAL,CBSERIAL,HHWT,CPI99,GQ,QGQ,PERNUM,PERWT,...,EDUCD_POP,EDUCD_SP,EDUCD_MOM2,EDUCD_POP2,INCTOT_HEAD,INCTOT_MOM,INCTOT_POP,INCTOT_SP,INCTOT_MOM2,INCTOT_POP2
0,1970,2,1,,100,4.54,1,0.0,1,100,...,,30.0,,,12450.0,,,3450.0,,
1,1970,2,1,,100,4.54,1,0.0,2,100,...,,60.0,,,12450.0,,,12450.0,,
2,1970,2,2,,100,4.54,1,0.0,1,100,...,,60.0,,,9050.0,,,0.0,,
3,1970,2,2,,100,4.54,1,0.0,2,100,...,,70.0,,,9050.0,,,9050.0,,
4,1970,2,4,,100,4.54,1,0.0,1,100,...,,23.0,,,7450.0,,,650.0,,


# As of now, a single-node GPU supports only up to 1.5 M records.
Using only the first 1M records for this demo.

In [10]:
# Cap the working set at the first one million rows for the rest of the demo.
N_DEMO_ROWS = 1_000_000
df = df.iloc[:N_DEMO_ROWS]

# Preprocessing
Using only the columns which are needed

In [11]:

# Columns retained for the analysis and the model; everything else is dropped.
keep_cols = ['YEAR', 'DATANUM', 'SERIAL', 'CBSERIAL', 'HHWT', 'GQ', 'PERNUM', 'SEX', 'AGE', 'INCTOT', 'EDUC', 'EDUCD', 'EDUC_HEAD', 'EDUC_POP', 'EDUC_MOM', 'INCTOT_MOM','INCTOT_POP', 'INCTOT_HEAD', 'SEX_HEAD']

# Replace missing values with the sentinel -1 and cast every retained column
# to float64. Doing this once on the whole frame gives the same result as the
# previous per-column index loop without reassigning each column twice.
df = df.loc[:, keep_cols].fillna(-1).astype('float64')


In [12]:
# Preview the cleaned frame: missing values now appear as -1.0 and every
# column is float64.
df.head()

Unnamed: 0,YEAR,DATANUM,SERIAL,CBSERIAL,HHWT,GQ,PERNUM,SEX,AGE,INCTOT,EDUC,EDUCD,EDUC_HEAD,EDUC_POP,EDUC_MOM,INCTOT_MOM,INCTOT_POP,INCTOT_HEAD,SEX_HEAD
0,1970.0,2.0,1.0,-1.0,100.0,1.0,1.0,1.0,39.0,12450.0,6.0,60.0,6.0,-1.0,-1.0,-1.0,-1.0,12450.0,1.0
1,1970.0,2.0,1.0,-1.0,100.0,1.0,2.0,2.0,36.0,3450.0,3.0,30.0,6.0,-1.0,-1.0,-1.0,-1.0,12450.0,1.0
2,1970.0,2.0,2.0,-1.0,100.0,1.0,1.0,1.0,56.0,9050.0,7.0,70.0,7.0,-1.0,-1.0,-1.0,-1.0,9050.0,1.0
3,1970.0,2.0,2.0,-1.0,100.0,1.0,2.0,2.0,54.0,0.0,6.0,60.0,7.0,-1.0,-1.0,-1.0,-1.0,9050.0,1.0
4,1970.0,2.0,4.0,-1.0,100.0,1.0,1.0,1.0,82.0,7450.0,1.0,17.0,1.0,-1.0,-1.0,-1.0,-1.0,7450.0,1.0


In [13]:
# Only the last expression of a cell is displayed, so the value_counts()
# result is computed but never shown; the cell output is the median of INCTOT.
df['INCTOT'].value_counts()
df['INCTOT'].median()

5450.0

In [14]:
# Preview the frame again before the income label is added.
df.head()

Unnamed: 0,YEAR,DATANUM,SERIAL,CBSERIAL,HHWT,GQ,PERNUM,SEX,AGE,INCTOT,EDUC,EDUCD,EDUC_HEAD,EDUC_POP,EDUC_MOM,INCTOT_MOM,INCTOT_POP,INCTOT_HEAD,SEX_HEAD
0,1970.0,2.0,1.0,-1.0,100.0,1.0,1.0,1.0,39.0,12450.0,6.0,60.0,6.0,-1.0,-1.0,-1.0,-1.0,12450.0,1.0
1,1970.0,2.0,1.0,-1.0,100.0,1.0,2.0,2.0,36.0,3450.0,3.0,30.0,6.0,-1.0,-1.0,-1.0,-1.0,12450.0,1.0
2,1970.0,2.0,2.0,-1.0,100.0,1.0,1.0,1.0,56.0,9050.0,7.0,70.0,7.0,-1.0,-1.0,-1.0,-1.0,9050.0,1.0
3,1970.0,2.0,2.0,-1.0,100.0,1.0,2.0,2.0,54.0,0.0,6.0,60.0,7.0,-1.0,-1.0,-1.0,-1.0,9050.0,1.0
4,1970.0,2.0,4.0,-1.0,100.0,1.0,1.0,1.0,82.0,7450.0,1.0,17.0,1.0,-1.0,-1.0,-1.0,-1.0,7450.0,1.0


Categorizing the data as:
 --> Low  (= 0.0 if total income is less than 10000), a threshold chosen because this is data from the 1970s
 --> High (= 1.0 otherwise)
This is done because RAPIDS supports only float data (integers or strings are not yet supported).

In [15]:
# Binary target: 0.0 when INCTOT is below the threshold, 1.0 otherwise.
# A vectorized comparison replaces the per-element Python lambda; negating the
# "< threshold" mask keeps the original rule exactly (anything that is not
# below the threshold is labelled 1.0).
# NOTE(review): rows with INCTOT == 9999999 (visible in the output further
# down for ages 1-13) look like a placeholder code and end up labelled as high
# income - confirm against the data dictionary and consider excluding them.
INCOME_THRESHOLD = 10000
df['income'] = (~(df['INCTOT'] < INCOME_THRESHOLD)).astype('float64')

In [16]:
# Display the new label column.
df['income']

0         1.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
999995    1.0
999996    1.0
999997    0.0
999998    0.0
999999    0.0
Name: income, Length: 1000000, dtype: float64

In [17]:
# Display the rows labelled as high income (income == 1.0).
df.query('income == 1.0')

Unnamed: 0,YEAR,DATANUM,SERIAL,CBSERIAL,HHWT,GQ,PERNUM,SEX,AGE,INCTOT,EDUC,EDUCD,EDUC_HEAD,EDUC_POP,EDUC_MOM,INCTOT_MOM,INCTOT_POP,INCTOT_HEAD,SEX_HEAD,income
0,1970.0,2.0,1.0,-1.0,100.0,1.0,1.0,1.0,39.0,12450.0,6.0,60.0,6.0,-1.0,-1.0,-1.0,-1.0,12450.0,1.0,1.0
9,1970.0,2.0,7.0,-1.0,100.0,1.0,1.0,1.0,25.0,11150.0,6.0,60.0,6.0,-1.0,-1.0,-1.0,-1.0,11150.0,1.0,1.0
11,1970.0,2.0,7.0,-1.0,100.0,1.0,3.0,1.0,1.0,9999999.0,0.0,1.0,6.0,6.0,6.0,4050.0,11150.0,11150.0,1.0,1.0
19,1970.0,2.0,13.0,-1.0,100.0,1.0,1.0,1.0,37.0,16850.0,6.0,65.0,6.0,-1.0,-1.0,-1.0,-1.0,16850.0,1.0,1.0
21,1970.0,2.0,13.0,-1.0,100.0,1.0,3.0,1.0,5.0,9999999.0,0.0,2.0,6.0,6.0,8.0,350.0,16850.0,16850.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999979,1970.0,2.0,364279.0,-1.0,100.0,1.0,1.0,1.0,43.0,22050.0,6.0,60.0,6.0,-1.0,-1.0,-1.0,-1.0,22050.0,1.0,1.0
999982,1970.0,2.0,364279.0,-1.0,100.0,1.0,4.0,1.0,11.0,9999999.0,2.0,23.0,6.0,6.0,6.0,0.0,22050.0,22050.0,1.0,1.0
999984,1970.0,2.0,364280.0,-1.0,100.0,1.0,1.0,1.0,31.0,30150.0,10.0,100.0,10.0,-1.0,-1.0,-1.0,-1.0,30150.0,1.0,1.0
999995,1970.0,2.0,364286.0,-1.0,100.0,1.0,4.0,1.0,13.0,9999999.0,2.0,23.0,6.0,6.0,5.0,0.0,6350.0,6350.0,1.0,1.0


In [18]:
# Rows whose SEX_HEAD is 1.0.
# presumably 1.0 encodes a male head of household (per the label printed in
# the next cell) - verify against the data dictionary.
men = df.query('SEX_HEAD ==1.0')

In [19]:
# Count of rows in the SEX_HEAD == 1.0 subset.
male_head_count = men['SEX_HEAD'].count()
print("Total number of Head of the family who are men = ", male_head_count)

Total number of Head of the family who are men =  845054


In [20]:
# Rows whose SEX_HEAD is 2.0.
# presumably 2.0 encodes a female head of household - verify against the data
# dictionary.
women = df.query('SEX_HEAD ==2.0')

In [21]:
# Count of rows in the SEX_HEAD == 2.0 subset. The label previously said
# "men" (copied from the cell above) even though this counts the `women` frame.
print("Total number of Head of the family who are women = ", women['SEX_HEAD'].count())

Total number of Head of the family who are men =  127434


# Highest Salary of men vs women

In [22]:
# Subset used for the per-year maxima below: rows whose SEX_HEAD is 1.0.
# NOTE(review): this filters on SEX_HEAD rather than SEX, so it selects every
# row with that SEX_HEAD value - confirm this is the intended population.
hi_men = df.query('SEX_HEAD ==1.0')

In [23]:
# Largest INCTOT per YEAR within the SEX_HEAD == 1.0 subset.
# NOTE(review): the maximum shown is 9999999.0, which looks like a placeholder
# code rather than a real income - confirm against the data dictionary.
hi_men.groupby('YEAR')['INCTOT'].max()

YEAR
1970.0    9999999.0
Name: INCTOT, dtype: float64

In [24]:
# Subset used for the per-year maxima below: rows whose SEX_HEAD is 2.0.
# NOTE(review): this filters on SEX_HEAD rather than SEX, so it selects every
# row with that SEX_HEAD value - confirm this is the intended population.
hi_women = df.query('SEX_HEAD ==2.0')

In [25]:
# Largest INCTOT per YEAR within the SEX_HEAD == 2.0 subset.
# NOTE(review): the maximum shown is 9999999.0, which looks like a placeholder
# code rather than a real income - confirm against the data dictionary.
hi_women.groupby('YEAR')['INCTOT'].max()

YEAR
1970.0    9999999.0
Name: INCTOT, dtype: float64

# Highest education level of men vs women

In [26]:
# Largest EDUCD value per YEAR within the SEX_HEAD == 1.0 subset.
hi_men.groupby('YEAR')['EDUCD'].max()

YEAR
1970.0    111.0
Name: EDUCD, dtype: float64

In [27]:
# Largest EDUCD value per YEAR within the SEX_HEAD == 2.0 subset.
hi_women.groupby('YEAR')['EDUCD'].max()

YEAR
1970.0    111.0
Name: EDUCD, dtype: float64

In [28]:
# Drop the exploratory subsets; they are not used again in this notebook.
del men, women, hi_men, hi_women

# Let's apply Logistic Regression

LogisticRegression is a linear model used to model the probability of certain events occurring, for example the probability that an event succeeds or fails.

cuML’s LogisticRegression accepts array-like objects, either on the host (as NumPy arrays) or on the device (as Numba or `__cuda_array_interface__`-compliant arrays), in addition to cuDF objects. It provides both binary (using sigmoid loss) and multi-class (using softmax loss) variants, depending on the input variables.

In [29]:
from cuml.preprocessing.model_selection import train_test_split
from cuml.linear_model import LogisticRegression
import cuml
import cupy

In [30]:
# Logistic regression estimator with C set to 0.1; all other settings are the
# library defaults.
reg = LogisticRegression(C=0.1)

In [31]:
# Single predictor for the model: the EDUC column.
X = df['EDUC']
X.head()

0    6.0
1    3.0
2    7.0
3    6.0
4    1.0
Name: EDUC, dtype: float64

In [32]:
# Target: the 0.0 / 1.0 income label created earlier.
y = df['income']
y.head()

0    1.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: income, dtype: float64

In [33]:
# Wrap the feature Series in a DataFrame.
# presumably so the estimator receives a 2-D feature matrix - TODO confirm.
X_df = cudf.DataFrame(X)

In [34]:
# Use 75% of the rows for training and hold out the rest for evaluation.
# A fixed random_state makes the shuffle, and therefore every metric computed
# from this split, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = cuml.preprocessing.model_selection.train_test_split(X_df, y, train_size=.75, random_state=42)


Let's see how many low- and high-income people there are in the test set

In [35]:
# Class balance of the held-out labels.
y_test.value_counts()

0.0    160875
1.0     89125
Name: income, dtype: int32

In [36]:
# Fit the classifier on the training split.
reg.fit(X_train, y_train)

LogisticRegression(penalty='l2', tol=0.0001, C=0.1, fit_intercept=True, max_iter=1000, linesearch_max_iter=50, verbose=4, l1_ratio=None, solver='qn', handle=<cuml.raft.common.handle.Handle object at 0x7f56e013ee30>, output_type='input')

In [37]:
# First few held-out labels; the index shows the original row positions.
y_test.head()

531400    1.0
250195    0.0
871135    0.0
784447    0.0
804213    1.0
Name: income, dtype: float64

In [38]:
# Predict a 0.0 / 1.0 label for every row of the test split and display it.
y_pred = reg.predict(X_test)
y_pred

0         1.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
249995    1.0
249996    1.0
249997    0.0
249998    1.0
249999    0.0
Length: 250000, dtype: float64

# Accuracy of Logistic Regression

In [39]:
# Score the fitted model on the held-out split and report it to two decimals.
test_accuracy = reg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.84


# Let's print the confusion matrix


In [40]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name: assigning it to `confusion_matrix`
# would replace the imported function with an array, so later calls to
# confusion_matrix(...) would fail.
# `.to_array()` hands scikit-learn host-side copies of the true and predicted
# labels.
conf_mat = confusion_matrix(y_test.to_array(), y_pred.to_array())
print(conf_mat)

[[155133   5742]
 [ 35259  53866]]
