In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 
%matplotlib inline

In [None]:
import os
# List the files available in the dataset1 input directory.
print(os.listdir("../input/dataset1"))

In [None]:
# Read the training features, the training labels and the test features;
# x_train / y_train are later passed to the classifier's fit().
x_train = pd.read_csv('../input/dataset1/X_train1.csv')
y_train = pd.read_csv('../input/dataset1/y_train_categories1.csv')
x_test = pd.read_csv('../input/dataset1/X_test1.csv')

In [None]:
# Remove the leading column of each training frame (chosen by position).
x_train = x_train.drop(columns=x_train.columns[[0]])
y_train = y_train.drop(columns=y_train.columns[[0]])
# Save the building_id column of the test frame before the positional drop.
building_id = x_test['building_id']
# Remove the columns found at positions 0 and 2 of the test frame.
x_test = x_test.drop(columns=x_test.columns[[0, 2]])
x_test

In [None]:
# Absolute pairwise correlations between the training features.
corr_matrix = x_train.corr().abs()
# Keep only the strict upper triangle so every feature pair is inspected once.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Columns whose correlation with an earlier column exceeds 0.95.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

In [None]:
x_train

In [None]:
x_train = x_train.drop(columns=to_drop)
x_test = x_test.drop(columns=to_drop)
y_train

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import xgboost

In [None]:
# Hyper-parameters handed to the XGBoost classifier.
xgb_params = dict(n_estimators=1000, learning_rate=0.1, subsample=0.7, colsample_bytree=1)
xgb = xgboost.XGBClassifier(**xgb_params)
# `model` holds whatever fit() returns for the x_train / y_train pair.
model = xgb.fit(x_train, y_train, verbose=1000)

In [None]:
# Load the main training table.
train_data = pd.read_csv("../input/train.csv")

In [None]:
# Column dtypes and non-null counts.
train_data.info()

In [None]:
# First five rows.
train_data.head(5)

In [None]:
# Distinct values of area_assesed.
train_data['area_assesed'].unique()

In [None]:
# Distinct values of damage_grade (used further down as the fit target).
train_data['damage_grade'].unique()

In [None]:
# Distinct values of has_geotechnical_risk.
train_data['has_geotechnical_risk'].unique()

In [None]:
# Number of distinct vdcmun_id values.
len(train_data['vdcmun_id'].unique())

In [None]:
# Number of distinct district_id values.
len(train_data['district_id'].unique())

In [None]:
## filling missing values ##
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that form is chained assignment, and under pandas
# copy-on-write it only changes a temporary and leaves train_data untouched.
# NOTE(review): the test set fills this column with its mode rather than 0 -
# confirm which of the two is intended.
train_data['has_repair_started'] = train_data['has_repair_started'].fillna(0)

In [None]:
# Show dtypes and non-null counts of train_data again.
train_data.info()

In [None]:
## dropping columns which are not useful ##
def drop_features(data,features):
    """Remove one or more columns from ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; it is changed in place.
    features : label or list of labels
        Column name(s) to remove.

    Returns
    -------
    None
    """
    data.drop(columns=features, inplace=True)

In [None]:
def convert_categorical_to_numerical(data,feature):
    """One-hot encode a single column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is not modified.
    feature : str
        Name of the column to encode; also used as the prefix of the
        generated indicator column names.

    Returns
    -------
    pandas.DataFrame
        Indicator columns for ``data[feature]`` with the first level dropped.
    """
    column = data[feature]
    dummies = pd.get_dummies(column, prefix=feature, drop_first=True)
    return dummies

In [None]:
# Append the indicator columns for area_assesed, then remove the raw column.
new_col = convert_categorical_to_numerical(train_data, 'area_assesed')
train_data = pd.concat([train_data, new_col], axis='columns')
train_data.drop(columns='area_assesed', inplace=True)

In [None]:
## deep analysis ##
# Load the building structure table.
building_structure = pd.read_csv('../input/Building_Structure.csv')

In [None]:
# Column dtypes and non-null counts.
building_structure.info()

In [None]:
# Drop, in place, every row that contains a missing value.
building_structure.dropna(inplace = True)

In [None]:
# Names of the object-dtype columns of building_structure.
categorical_values = [name for name, dtype in building_structure.dtypes.items() if dtype == 'O']
print(categorical_values)

In [None]:
# One-hot encode every object column except the first one and append all the
# indicator frames to building_structure with a single concat.
encoded_frames = [
    convert_categorical_to_numerical(building_structure, col)
    for col in categorical_values[1:]
]
building_structure = pd.concat([building_structure] + encoded_frames, axis=1)

In [None]:
# Column names present in both train_data and building_structure, sorted.
common_cols = sorted(set(train_data.columns) & set(building_structure.columns))
common_cols

In [None]:
# Keep building_id out of the drop list: it is the key of the later merge.
# Filtering by name replaces the positional pop(0), which relied on
# 'building_id' sorting first and removed another column every time the cell
# was re-run.
common_cols = [col for col in common_cols if col != 'building_id']

In [None]:
common_cols

In [None]:
drop_features(building_structure,categorical_values[1:] + common_cols)

In [None]:
## merging with train data ##

In [None]:
[col for col in building_structure.columns if building_structure[col].dtype == 'O']

In [None]:
train_data = pd.merge(train_data,building_structure,how="left",on=['building_id'])

In [None]:
[col for col in train_data.columns if train_data[col].dtype == 'O']

In [None]:
train_data.info()

In [None]:
## deep analysis on building ownership ##

In [None]:
# Load the building ownership / use table.
building_ownership = pd.read_csv('../input/Building_Ownership_Use.csv')

In [None]:
# Column dtypes and non-null counts.
building_ownership.info()

In [None]:
# Distinct values of legal_ownership_status.
building_ownership['legal_ownership_status'].unique()

In [None]:
new_col = convert_categorical_to_numerical(building_ownership,'legal_ownership_status')
building_ownership = pd.concat([building_ownership,new_col],axis=1)

In [None]:
common_cols = sorted(list(set(train_data.columns).intersection(set(building_ownership.columns))))
common_cols

In [None]:
# Keep building_id out of the drop list: it is the key of the later merge.
# Filtering by name replaces the positional pop(0), which relied on
# 'building_id' sorting first and removed another column every time the cell
# was re-run.
common_cols = [col for col in common_cols if col != 'building_id']

In [None]:
common_cols

In [None]:
drop_features(building_ownership,['legal_ownership_status'] + common_cols)

In [None]:
building_ownership.info()

In [None]:
## merging building_ownership with train data ##

In [None]:
train_data = pd.merge(train_data,building_ownership,how="left",on="building_id")

In [None]:
train_data.info()

In [None]:
## we have one null value in count_families ##
# Fill missing count_families with the most frequent value. The result is
# assigned back because fillna(inplace=True) on a selected column is chained
# assignment and, under pandas copy-on-write, leaves train_data untouched.
most_common_families = train_data['count_families'].mode()[0]
train_data['count_families'] = train_data['count_families'].fillna(most_common_families)

In [None]:
## fitting the model with the train data ##
# NOTE(review): this cell used to run `train1 = pd.read_csv('')`. An empty path
# always raises FileNotFoundError and stops "Run All", and `train1` is not
# used by any visible cell, so the call was removed.
# TODO: restore it with the real CSV path if `train1` is needed.

In [None]:
from sklearn import datasets
import xgboost as xg
# Fit a default XGBoost classifier to inspect feature importances.
xgb = xg.XGBClassifier()
# fit() needs both features and target; the original call passed only the full
# frame and raised TypeError. The id and target columns are excluded from the
# features, matching the model.fit call that follows.
# NOTE(review): recent XGBoost releases may require integer-encoded class
# labels - confirm against the installed version.
fit = xgb.fit(
    train_data.drop(['building_id', 'damage_grade'], axis=1),
    train_data['damage_grade'],
)
fit.feature_importances_

In [None]:
# Train on every column except the id and the target; the target is damage_grade.
model.fit(
    train_data.drop(columns=['building_id', 'damage_grade']),
    train_data['damage_grade'],
)

In [None]:
## reading test data ##
# Load the test table.
test_data = pd.read_csv('../input/test.csv')

In [None]:
# Column dtypes and non-null counts.
test_data.info()

In [None]:
## filling missing values ##
# Fill missing has_repair_started with the column's most frequent value. The
# result is assigned back because fillna(inplace=True) on a selected column is
# chained assignment and, under pandas copy-on-write, leaves test_data untouched.
# NOTE(review): the training set fills this column with 0 rather than the
# mode - confirm which of the two is intended.
repair_started_mode = test_data['has_repair_started'].mode()[0]
test_data['has_repair_started'] = test_data['has_repair_started'].fillna(repair_started_mode)

In [None]:
area_assesed = convert_categorical_to_numerical(test_data,'area_assesed')
test_data = pd.concat([test_data,area_assesed], axis = 1)

In [None]:
drop_features(test_data,['area_assesed'])

In [None]:
common_cols = list(set(test_data.columns).intersection(set(building_structure.columns)))
common_cols

In [None]:
## merging building structure data with test data ##

In [None]:
test_data = pd.merge(test_data,building_structure,how='left',on='building_id')

In [None]:
test_data.info()

In [None]:
common_cols = list(set(test_data.columns).intersection(set(building_ownership.columns)))
common_cols

In [None]:
## merge building ownership data with test data ##

In [None]:
test_data = pd.merge(test_data,building_ownership,how='left',on='building_id')

In [None]:
test_data.info()

In [None]:
building_id = test_data['building_id']
drop_features(test_data,['building_id'])

In [None]:
test_data

In [None]:
# Predict with the fitted model on the prepared test frame.
# NOTE(review): assumes test_data has the same columns, in the same order, as
# the training features - confirm.
predictions = model.predict(test_data)

In [None]:
print(predictions)

In [None]:
final_result = pd.DataFrame({ 'building_id' : building_id , 'damage_grade' : predictions})
final_result.to_csv('output.csv', index = False)

In [None]:
print(final_result)

In [None]:
# info() prints its report itself and returns None, so print() adds a "None" line.
print(test_data.info())

In [None]:
# Read the raw test table a second time, into test_data1.
test_data1=pd.read_csv('../input/test.csv')

In [None]:
print(test_data1.info())

In [None]:
# Start from the building and district ids of the re-read test table.
x = test_data1[['building_id', 'district_id']].copy()
# Attach the structure columns, then keep only the first five columns (by position).
x = x.merge(building_structure, how='left', on='building_id')
x = x.drop(columns=x.columns[5:])
# Difference between the pre_eq and post_eq floor counts.
x['floors_diff'] = x['count_floors_pre_eq'] - x['count_floors_post_eq']
x['mun_id'] = test_data1['vdcmun_id']
x['grade'] = final_result['damage_grade']
x

In [None]:
y = x.sort_values(by=['grade','floors_diff'],ascending=False)
y

In [None]:
z = y.sort_values(by=['district_id'],kind='mergesort')
z[:1000]

In [None]:
# Output file names file1.csv .. file31.csv.
files = ["file" + str(number) + ".csv" for number in range(1, 32)]
f = []
print(files)

In [None]:
# Write one CSV per district group, stopping after 31 groups.
grp = z.groupby('district_id')
counter = 0
for counter, (name, group) in enumerate(grp, start=1):
    print(name)
    print(group)
    # counter is 1-based here, so the matching file name sits at counter - 1.
    group.to_csv(files[counter - 1], index=False)
    if counter == 31:
        break
        

In [None]:
# Keep only the first and last columns of z, then re-attach every structure
# column by building_id.
grading = z.drop(columns=z.columns[1:-1])
grading = grading.merge(building_structure, how='left', on='building_id')
grading

In [None]:
grp1 = grading.groupby('grade')
# Output file for each grade that gets exported; other grades are skipped.
grade_exports = {'Grade 5': 'grade_5.csv', 'Grade 1': 'grade_1.csv'}
counter = 0
for counter, (name, group) in enumerate(grp1, start=1):
    if name in grade_exports:
        group.to_csv(grade_exports[name], index=False)
    # Stop after five groups.
    if counter == 5:
        break

In [None]:
# Read back the grade_5.csv export and print it.
test = pd.read_csv('grade_5.csv')
print(test)