In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [150]:
# Load the Forbes billionaires dataset from the working directory and preview the first rows.
df = pd.read_csv('forbes_billionaires.csv')
df.head()

Unnamed: 0,Name,NetWorth,Country,Source,Rank,Age,Residence,Citizenship,Status,Children,Education,Self_made
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True
2,Bernard Arnault & family,150.0,France,LVMH,3,72.0,"Paris, France",France,Married,5.0,"Bachelor of Arts/Science, Ecole Polytechnique ...",False
3,Bill Gates,124.0,United States,Microsoft,4,65.0,"Medina, Washington",United States,Divorced,3.0,"Drop Out, Harvard University",True
4,Mark Zuckerberg,97.0,United States,Facebook,5,36.0,"Palo Alto, California",United States,Married,2.0,"Drop Out, Harvard University",True


In [45]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755 entries, 0 to 2754
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         2755 non-null   object 
 1   NetWorth     2755 non-null   float64
 2   Country      2755 non-null   object 
 3   Source       2755 non-null   object 
 4   Rank         2755 non-null   int64  
 5   Age          2630 non-null   float64
 6   Residence    2715 non-null   object 
 7   Citizenship  2739 non-null   object 
 8   Status       2090 non-null   object 
 9   Children     1552 non-null   float64
 10  Education    1409 non-null   object 
 11  Self_made    2737 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 258.4+ KB


In [151]:
def rand_fill_na(series, random_state=0):
    """Fill missing entries of ``series`` with values sampled from its observed entries.

    Every missing position receives a value drawn at random, with replacement,
    from the non-missing values of the same series.

    Parameters
    ----------
    series : pandas.Series
        Series that may contain missing values. It is never modified.
    random_state : int, numpy.random.RandomState or None, default 0
        Seed forwarded to ``Series.sample``. The default of 0 keeps the
        imputation reproducible between runs.

    Returns
    -------
    pandas.Series
        ``series`` itself when nothing is missing; otherwise a new series with
        the same index in which every missing entry has been replaced.

    Raises
    ------
    ValueError
        If ``series`` has missing entries but no observed value to sample from.
    """
    is_missing = pd.isnull(series)
    n_missing = int(is_missing.sum())

    if n_missing == 0:
        return series

    observed = series[~is_missing]
    if observed.empty:
        raise ValueError(
            "rand_fill_na: cannot sample fill values from a series with no observed values"
        )

    sampled = observed.sample(n=n_missing, replace=True, random_state=random_state)

    # Write the sampled values by position rather than by index label, so a
    # series whose index repeats a label on several missing rows is still filled.
    filled = series.copy()
    missing_positions = is_missing.to_numpy().nonzero()[0]
    filled.iloc[missing_positions] = sampled.to_numpy()
    return filled

In [152]:
# Impute missing ages by sampling from the observed ages; the result is stored in a new lowercase "age" column.
df["age"] = rand_fill_na(df["Age"])


In [153]:
# Remove the original "Age" column now that the imputed "age" column replaces it.
# `columns=` names the axis explicitly instead of passing a boolean as the axis
# number, which only worked because True == 1.
df.drop(columns='Age', inplace=True)


In [97]:
# Count how often each name occurs to spot duplicated entries.
df['Name'].value_counts()

Robert Miller             2
Li Li                     2
Wang Yanqing & family     2
Chen Guoxiang & family    1
Tony Chen                 1
                         ..
Frank VanderSloot         1
Kelcy Warren              1
Xu Yi                     1
Vladimir Yevtushenkov     1
Zhou Wei family           1
Name: Name, Length: 2752, dtype: int64

In [14]:
# Number of billionaires per citizenship, most frequent first.
df['Citizenship'].value_counts()

United States           718
China                   622
India                   138
Germany                 136
Russia                  118
                       ... 
Venezuela                 1
Eswatini (Swaziland)      1
Algeria                   1
Liechtenstein             1
Nepal                     1
Name: Citizenship, Length: 70, dtype: int64

Label-encode the `Source` and `Self_made` columns.

In [154]:
# Replace each distinct Source value with an integer class code.
# NOTE(review): the codes put an arbitrary order on a nominal column; confirm that is
# acceptable for the linear model fitted later in this notebook.
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
df['Source'] = l.fit_transform(df['Source'])

In [155]:
# Re-fit the same encoder object on Self_made and overwrite the column with its integer codes.
# NOTE(review): df.info() above reports 2737 non-null values out of 2755 for Self_made, so
# missing entries are present here; a later cell keeps only codes 0 and 1, presumably to
# discard whatever code the missing entries receive. Confirm.
df['Self_made'] = l.fit_transform(df['Self_made'])

In [156]:
# One indicator column per citizenship; the first category is dropped as the baseline.
dummy = pd.get_dummies(df['Citizenship'], drop_first=True)

In [102]:
# Append the citizenship indicator columns to the main frame.
# NOTE(review): this cell's execution count (In [102]) is out of sequence with its
# neighbours (In [156], In [157]), and the df.head() / df.columns outputs recorded
# further down list no indicator columns, so presumably it was not executed in the
# recorded session. Confirm whether the indicators are meant to be model features;
# a top-to-bottom re-run would include them and change the recorded metrics.
df = pd.concat([df,dummy],axis=1)

In [157]:
# Discard the text columns that are not used as model inputs.
df.drop(
    columns=[
        'Country',
        'Residence',
        'Children',
        'Status',
        'Education',
        'Citizenship',
        'Name',
    ],
    inplace=True,
)

In [158]:
# Preview the frame after encoding and column removal.
df.head()

Unnamed: 0,NetWorth,Source,Rank,Self_made,age
0,177.0,6,1,1,57.0
1,151.0,143,2,1,49.0
2,150.0,86,3,0,72.0
3,124.0,96,4,1,65.0
4,97.0,48,5,1,36.0


In [159]:
# Keep only the rows whose Self_made code is 0 or 1.
df = df.loc[df['Self_made'].isin([0, 1])]

In [160]:
# Feature matrix: every column except the target, in the original column order.
X = df[[column for column in df.columns if column != 'NetWorth']]


In [161]:
# Target vector: net worth.
y = df.loc[:, 'NetWorth']

In [162]:
# List the columns remaining in the modelling frame.
df.columns

Index(['NetWorth', 'Source', 'Rank', 'Self_made', 'age'], dtype='object')

In [163]:
from sklearn.preprocessing import StandardScaler 


Split the data into training and test sets, then scale the features.

In [167]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

In [168]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into the fit.
sc = StandardScaler()
x_train = sc.fit_transform(X_train)
x_test = sc.transform(X_test)

# The cells below fit and score the model on `X_train` / `X_test`, so the scaled arrays
# above were previously never used. Rebind those names to the scaled values, keeping the
# row index and column labels. The lowercase arrays stay defined for any code that reads them.
X_train = pd.DataFrame(x_train, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(x_test, index=X_test.index, columns=X_test.columns)

In [169]:
from sklearn import metrics

In [170]:
# Fit an ordinary least-squares model of net worth on the training features.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [171]:
# Predicted net worth for the held-out test rows.
ypred2 = regressor.predict(X_test)

In [173]:
def adj_r2(x, y, model=None):
    """Return the adjusted R-squared of a fitted regressor on ``(x, y)``.

    Computed as ``1 - (1 - R2) * (n - 1) / (n - p - 1)``, where ``R2`` is
    ``model.score(x, y)``, ``n`` is ``x.shape[0]`` and ``p`` is ``x.shape[1]``.

    Parameters
    ----------
    x : object with a two-element ``shape``
        Feature matrix passed to ``score``.
    y : array-like
        Target values passed to ``score``.
    model : object with a ``score(x, y)`` method, optional
        Fitted estimator to evaluate. When omitted, the notebook-level
        ``regressor`` is used, which matches the previous behaviour.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - p - 1`` is not positive, in which case the statistic is undefined.
    """
    estimator = regressor if model is None else model
    r2 = estimator.score(x, y)
    n_samples, n_features = x.shape[0], x.shape[1]

    # Residual degrees of freedom; the formula divides by this quantity.
    dof = n_samples - n_features - 1
    if dof <= 0:
        raise ValueError(
            "adj_r2 is undefined when n_samples - n_features - 1 <= 0 "
            f"(n_samples={n_samples}, n_features={n_features})"
        )

    return 1 - (1 - r2) * (n_samples - 1) / dof

In [174]:
# Adjusted R-squared on the training split.
adj_r2(X_train,y_train)

0.23905343190307926

In [175]:
# Adjusted R-squared on the held-out test split.
adj_r2(X_test,y_test)

0.22084847757248371

In [178]:
from sklearn.metrics import mean_squared_error


In [179]:
# Root-mean-squared error on the test split, in the same units as NetWorth.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mean_squared_error(y_test, ypred2)))


8.79608394948607
