In [84]:
import pandas as pd
import numpy as np
import datetime
import warnings
import pickle
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

In [85]:
data = pd.read_csv('1806077.csv', nrows=50000)
data.shape

(50000, 19)

In [86]:
data['clear_date']=pd.to_datetime(data.clear_date)
data['due_in_date']=pd.to_datetime(data.due_in_date,format="%Y%m%d")
data['delay']=data['clear_date']-data['due_in_date']

In [87]:
X_test=data[data['clear_date'].isna()]

In [88]:
X_train=data[data['clear_date'].notna()]

In [89]:
#fillna fills the null values present in the given column
X_train['invoice_id'] = X_train['invoice_id'].fillna(X_train['doc_id'])

In [90]:
X_train['posting_date']=pd.to_datetime(X_train.posting_date,format="%Y-%m-%d")
X_train['baseline_create_date']=pd.to_datetime(X_train.baseline_create_date,format="%Y%m%d")
X_train['document_create_date.1']=pd.to_datetime(X_train['document_create_date.1'],format="%Y%m%d")

In [91]:
X_train['delay']=( X_train['delay']/ np.timedelta64 (1, 'D')).astype (float)

In [92]:
X_train_new, X_test_initial, Y_train_new, Y_test_initial = train_test_split(X_train.drop('delay', axis=1), 
                                                    X_train.delay,test_size=0.3,shuffle=False,random_state=1)
print(X_train_new.shape, Y_train_new.shape, X_test_initial.shape, Y_test_initial.shape)

(31948, 19) (31948,) (13692, 19) (13692,)


In [93]:
X_val,X_test_new,Y_val,Y_test_new=train_test_split(X_test_initial, 
                                                    Y_test_initial,test_size=0.5,shuffle=False,random_state=1)
print(X_val.shape,X_test_new.shape,Y_val.shape,Y_test_new.shape)

(6846, 19) (6846, 19) (6846,) (6846,)


In [94]:
mapper = X_train_new.groupby('name_customer')['total_open_amount'].mean().to_dict() 
X_train_new['avg_amt_company'] = X_train_new['name_customer'].map(mapper)

In [95]:
mapper = X_test.groupby('name_customer')['total_open_amount'].mean().to_dict() 
X_test['avg_amt_company'] = X_test['name_customer'].map(mapper)

In [96]:
X_train_new['delay'] = Y_train_new 
target_mapper = X_train_new.groupby('name_customer')['delay'].agg(lambda x:x.value_counts().index[0]).to_dict()
X_train_new['modedelay'] = X_train_new['name_customer'].map(target_mapper)
X_train_new.drop('delay',axis=1,inplace=True)

In [97]:
X_test['modedelay'] = X_test['name_customer'].map(target_mapper)

In [98]:
X_test['modedelay'].fillna(0,inplace=True)

In [99]:
X_test['posting_date']=pd.to_datetime(X_test.posting_date,format="%Y-%m-%d")
X_test['baseline_create_date']=pd.to_datetime(X_test.baseline_create_date,format="%Y%m%d")
X_test['document_create_date.1']=pd.to_datetime(X_test['document_create_date.1'],format="%Y%m%d")

In [100]:
X_test['diff']=X_test['due_in_date']
X_test['diff']=X_test['due_in_date']-X_test['document_create_date.1']
X_test['diff']=( X_test['diff']/ np.timedelta64 (1, 'D')).astype (float)

In [101]:
X_train_new['diff']=X_train_new['due_in_date']
X_train_new['diff']=X_train_new['due_in_date']-X_train_new['document_create_date.1']
X_train_new['diff']=( X_train_new['diff']/ np.timedelta64 (1, 'D')).astype (float)

In [102]:
obj_columns=list(X_test.columns[X_test.dtypes=='object'])
obj_columns
date_columns=list(X_test.columns[X_test.dtypes=='datetime64[ns]'])
date_columns

['clear_date',
 'posting_date',
 'document_create_date.1',
 'due_in_date',
 'baseline_create_date']

In [103]:
X_test.drop(obj_columns,axis=1,inplace=True)
X_test.drop(date_columns,axis=1,inplace=True)

In [104]:
X_train_new.drop(obj_columns,axis=1,inplace=True)
X_train_new.drop(date_columns,axis=1,inplace=True)

In [105]:
X_test.drop(['buisness_year','doc_id','document_create_date','posting_id','area_business','invoice_id'],axis=1,inplace=True)

In [106]:
X_train_new.drop(['buisness_year','doc_id','document_create_date','posting_id','area_business','invoice_id'],axis=1,inplace=True)

In [109]:
X_train_new.head()

Unnamed: 0,total_open_amount,avg_amt_company,modedelay,diff
0,2213.53,21931.645227,-5.0,15.0
1,44919.66,22248.620362,-4.0,15.0
2,7340.26,6626.754265,-8.0,0.0
3,21581.75,34787.456,0.0,40.0
4,283.68,23467.855098,0.0,20.0


In [108]:
X_train_new.drop(['isOpen'],axis=1,inplace=True)

In [110]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [111]:


clf = LinearRegression()
clf.fit(X_train_new, Y_train_new)
predicted = clf.predict(X_train_new)

In [112]:
predicted

array([ -2.59141841,  -1.99300791,  -2.31870373, ...,  -2.65406517,
         1.49314279, -20.45048685])

In [113]:
y=mean_squared_error(Y_train_new, predicted)
y

63.00743454808232

In [114]:
X_test.drop(['isOpen','delay'],axis=1,inplace=True)

In [115]:
X_train_new.head()

Unnamed: 0,total_open_amount,avg_amt_company,modedelay,diff
0,2213.53,21931.645227,-5.0,15.0
1,44919.66,22248.620362,-4.0,15.0
2,7340.26,6626.754265,-8.0,0.0
3,21581.75,34787.456,0.0,40.0
4,283.68,23467.855098,0.0,20.0


In [116]:
X_test.head()

Unnamed: 0,total_open_amount,avg_amt_company,modedelay,diff
45640,4071.12,21566.82625,1.0,15.0
45641,39121.54,24706.9545,1.0,15.0
45642,3289.18,15146.0925,0.0,15.0
45643,78217.96,18501.498585,-5.0,15.0
45644,14565.55,28213.058333,0.0,20.0


In [117]:
clf = LinearRegression()
clf.fit(X_train_new, Y_train_new)
predicted = clf.predict(X_test)
predicted

array([ 2.30381737,  2.12540471,  1.48723061, ..., -3.40231345,
       -2.06718868,  2.06207481])

In [119]:
import pickle

In [121]:
with open('model_pickle','wb') as file:
    pickle.dump(clf,file)