In [1]:
import pandas as pd
import random
from collections import Counter
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
age_1 = np.random.choice(range(20, 25), 1500)
age_2 = np.random.choice(range(25, 46), 5000)
age_3 = np.random.choice(range(46, 66), 3500)

age1_df = pd.DataFrame(age_1, columns=['Age'])
age2_df = pd.DataFrame(age_2, columns=['Age'])
age3_df = pd.DataFrame(age_3, columns=['Age'])

In [3]:
df = pd.concat([age1_df, age2_df, age3_df],  axis=0, ignore_index=True)

# Shuffle data
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
df.shape

(10000, 1)

In [5]:
df.value_counts()

Age
24     314
21     305
23     298
22     295
20     288
37     260
39     255
30     253
27     250
35     250
31     247
41     247
34     241
40     241
38     239
32     238
43     237
28     236
33     233
36     230
25     230
44     227
26     225
45     224
42     221
48     221
29     216
63     199
59     196
60     192
65     191
49     191
64     178
55     176
54     173
57     172
51     167
62     167
47     167
50     166
61     163
52     161
56     160
58     160
46     156
53     144
dtype: int64

In [6]:
df.head()

Unnamed: 0,Age
0,27
1,51
2,43
3,63
4,22


In [7]:
# income
income = []

for x in df['Age']:
  if 19 < x <= 24:
    x = random.randint(15000, 50000)
    income.append(x)
  elif 24 < x <= 44:
    x = random.randint(25000, 100000)
    income.append(x)
  elif 44 < x <= 65:
    x = random.randint(35000, 150000)
    income.append(x)


df['Income'] = income

In [8]:
# expenses

df['Expenses'] = [x*random.randint(50, 100)/100 for x in df['Income']] 

In [9]:
# experience - just keeping it simple 
  

df['Experience'] = df['Age'] -20

In [10]:
# Gender
# 50% male 50% female

male = ["m"]*5000
female = ["f"]*5000

gender = male + female
random.shuffle(gender) 


df['Gender'] =  gender

In [11]:
# Company Rank

rank1 = ["Rank 1"]*2500
rank2 = ["Rank 2"]*3500
rank3 = ["Rank 3"]*4000

rank = rank1 + rank2 + rank3
random.shuffle(rank)

df['Company'] = rank

In [12]:
eligibility = []

for index, row in df.iterrows():
  if (19 < row['Age'] <= 25) & ((row['Income'] - row['Expenses']) >= 10000 ) & (row['Experience'] >= 2) & (row['Company'] == ('Rank 1' or 'Rank 2')):
    eligibility.append('yes')
  elif (25 < row['Age'] <= 45) & ((row['Income'] - row['Expenses']) >= 5000 ) & (row['Experience'] >= 5):
    eligibility.append('yes')
  elif (45 < row['Age'] <= 55) & ((row['Income'] - row['Expenses']) >= 5000 ) & (row['Experience'] >= 25) & (row['Company'] == ('Rank 1' or 'Rank 2')):
    eligibility.append('yes')
  elif (55 < row['Age'] <= 65) & ((row['Income'] - row['Expenses']) >= 5000 ) & (row['Experience'] >= 35) & (row['Company'] == ('Rank 1' or 'Rank 2')):
    eligibility.append('yes')
  else:
    eligibility.append('no')

df['Eligibility'] = eligibility



In [13]:
df.head()

Unnamed: 0,Age,Income,Expenses,Experience,Gender,Company,Eligibility
0,27,92753,56579.33,7,m,Rank 1,yes
1,51,132052,100359.52,31,m,Rank 2,no
2,43,57742,56587.16,23,f,Rank 1,no
3,63,86610,82279.5,43,f,Rank 2,no
4,22,25878,14232.9,2,m,Rank 2,no


In [14]:
df['Eligibility'].value_counts()

no     5176
yes    4824
Name: Eligibility, dtype: int64

In [15]:
df = df.reindex(columns =['Age', 'Gender', 'Income', 'Expenses', 'Company', 'Experience', 'Eligibility'])

In [16]:
df.head()

Unnamed: 0,Age,Gender,Income,Expenses,Company,Experience,Eligibility
0,27,m,92753,56579.33,Rank 1,7,yes
1,51,m,132052,100359.52,Rank 2,31,no
2,43,f,57742,56587.16,Rank 1,23,no
3,63,f,86610,82279.5,Rank 2,43,no
4,22,m,25878,14232.9,Rank 2,2,no


In [17]:
# Data prepared

df.to_csv('Eligibility_data.csv')

In [18]:
# Creating model function

# one-hot-encoding
one_hot_features = ['Gender', 'Eligibility']

# Convert categorical variables into dummy/indicator variables (i.e. one-hot encoding).
one_hot_encoded = pd.get_dummies(df[one_hot_features], drop_first=True)
# Convert Categorical to Numerical for default column
one_hot_encoded.info(verbose=True, memory_usage=True, null_counts=True)

# Replacing categorical columns with dummies
fdf = df.drop(one_hot_features, axis=1)
fdf = pd.concat([fdf, one_hot_encoded], axis=1)

# Label encoding the Company Rank
fdf['Company'] = fdf['Company'].map({'Rank 1':0,'Rank 2':1, 'Rank 3': 2})

# splitting X  &  y
X = fdf.drop('Eligibility_yes', 1)
y = fdf['Eligibility_yes']

# splitting in train & test
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size = 0.3, random_state=10)

# Creating model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred=rf_model.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Gender_m         10000 non-null  uint8
 1   Eligibility_yes  10000 non-null  uint8
dtypes: uint8(2)
memory usage: 19.7 KB
[[1468   85]
 [  26 1421]]
0.963
              precision    recall  f1-score   support

           0       0.98      0.95      0.96      1553
           1       0.94      0.98      0.96      1447

    accuracy                           0.96      3000
   macro avg       0.96      0.96      0.96      3000
weighted avg       0.96      0.96      0.96      3000



In [19]:
X_train.head(2)

Unnamed: 0,Age,Income,Expenses,Company,Experience,Gender_m
5663,27,96787,71622.38,1,7,0
2840,58,39320,36567.6,2,38,1


In [20]:
input = {'Age': 31, 'Income': 55000, 'Expenses': 35000, 'Company': 4, 'Experience': 15, 'Gender_m': 1}

input_df = pd.DataFrame(input, index=[0])


def  find_eligibility(input):
  pred = rf_model.predict(input)
  return pred



input_df

Unnamed: 0,Age,Income,Expenses,Company,Experience,Gender_m
0,31,55000,35000,4,15,1


In [21]:
pred = find_eligibility(input_df)
pred[0]

1

In [22]:
# user_input_dict = {'Age': 36, 'Income': 35000, 'Expenses': 25000, 'Company': 3, 'Experience': 13, 'Gender_m': 1}
user_input_dict = zip([36], [35000], [25000], [3], [13], [1])
col = ['Age', 'Income', 'Expenses', 'Company', 'Experience', 'Gender_m']

print("Input_dict: ", user_input_dict)
input_df = pd.DataFrame(list(user_input_dict), columns=col)

print('Input_df: ', input_df)
pred = find_eligibility(input_df)
print("Eligibility_score: ", pred[0])

Input_dict:  <zip object at 0x7f11bb4fba08>
Input_df:     Age  Income  Expenses  Company  Experience  Gender_m
0   36   35000     25000        3          13         1
Eligibility_score:  1


In [23]:
mcap_comp = pd.read_excel('/content/company_name.xlsx')
mcap_comp.head()

FileNotFoundError: ignored

In [None]:
mcap_comp.shape

In [None]:
mcap_comp.columns

In [None]:
dff = mcap_comp.drop(['Sr. No.', 'ISIN', 'BSE Symbol',
       'BSE 6 month Avg Total Market Cap in (Rs. Crs.)', 'NSE Symbol',
       'NSE 6 month Avg Total Market Cap (Rs. Crs.)', 'MSEI Symbol', 
       'MSEI 6 month Avg Total Market Cap in (Rs Crs.)',
       'Average of All Exchanges (Rs. Cr.)'], axis=1)

dff.head()

In [None]:
dff = dff.rename(columns={'Company name': 'Company_name', 'Categorization as per SEBI Circular dated Oct 6, 2017': 'Rank'})

In [None]:
dff['Rank'].value_counts()

In [None]:
rank = []

for x in dff['Rank']:
  if x == 'Large Cap':
    rank.append(1)
  elif x == 'Mid Cap' or x == ' Mid Cap':
    rank.append(2)
  else :
    rank.append(3)


dff['Rank'] = rank

In [None]:
dff.head()

In [None]:
dff['Rank'].value_counts()

In [None]:
dff.to_csv('Company_list.csv')

In [None]:
df_name = pd.read_csv('/content/Indian-Male-Names.csv')

name_1  =  'birender Kamble'
def gender_code(name):
    name.lower()
    name = name.split(' ')
    if any(name[0]) in df_name['name']:
        gen = 1
    else:
        gen = 0
    return gen


gender_code(name_1)

In [None]:
df_name.info()

In [None]:
def eligibility_input(user):
    eli_inputs = []
    for x in user:
        x.lower()
        if "my name is" in x:
            name = x.replace("my name is ", '')
            eli_inputs.append([name])
        elif "date of birth is" in x or "date is" in x:
            dob = [int(s) for s in x.split() if s.isdigit()]
            eli_inputs.append(dob)
        elif "net income" in x:
            netincome = [int(s) for s in x.split() if s.isdigit()]
            eli_inputs.append(netincome)
        elif "total expenses" in x:
            totalexp = [int(s) for s in x.split() if s.isdigit()]
            eli_inputs.append(totalexp)
        elif "company name" in x or "business name" in x:
          if "company name" in x:
            company_name = x.replace('company name ', '')
            eli_inputs.append([company_name])
          else:
            company_name = x.replace('business name ', '')
            eli_inputs.append([company_name])
        elif "total experience" in x:
            exp = [int(s) for s in x.split() if s.isdigit()]
            eli_inputs.append(exp)
    print("Inputs -", eli_inputs)
    return eli_inputs


In [None]:
user = ['hi there, i need a personal loan', 'ok', 'my name is santosh kamble', 
        'date is 14051996', 'net income 45000', 'total expenses 35000', 
        'business name Teleperformamce pvt ltd', 'total experience 15 year']

In [None]:
eligibility_input(user)

In [None]:
df_name = pd.read_csv('/content/Indian-Male-Names.csv')
def gender_code(name):
    name.lower()
    name = name.split(' ')
    if any(name[0]) in df_name['name']:
        gen = 1
    else:
        gen = 0
    return gen

In [None]:
name = 'Ritesh'

gender_code(name)