In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import auc
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score


In [None]:
# Load the competition's train and test CSV files from the ../input directory.
train_path = '../input/tmdb-box-office-prediction/train.csv'
test_path = '../input/tmdb-box-office-prediction/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Baseline Model Using Numeric Columns Only

In [None]:
# Keep only the int64/float64 columns of each frame; columns of any other
# dtype are left out of this numeric-only baseline.
# NOTE(review): a numeric identifier column (e.g. an `id`), if present, is kept
# as a feature by this selection -- confirm that is intended.
numeric_dtypes = ['int64', 'float64']
train_num = train.select_dtypes(include=numeric_dtypes)
test_num = test.select_dtypes(include=numeric_dtypes)

## Viewing Numeric Data

In [None]:
# Preview the first five rows of the numeric training columns.
train_num.head(n=5)

## Extracting the Target Column

In [None]:
# Target vector: read it from the raw training frame rather than from
# `train_num`, so this cell still works when it is re-run after `revenue`
# has already been removed from the feature frame.
y = train['revenue']

# Feature matrix: rebind to a new frame instead of dropping in place.
# - A non-mutating drop avoids modifying the frame returned by select_dtypes
#   (which can trigger pandas' chained-assignment warning).
# - errors='ignore' makes the cell idempotent: running it a second time is a
#   no-op instead of raising because the column is already gone.
train_num = train_num.drop(columns=['revenue'], errors='ignore')

In [None]:
# Two-stage pipeline: fill missing values with each column's mean, then fit a
# random forest regressor (200 trees, fixed seed for repeatable results).
mean_imputer = SimpleImputer(strategy='mean')
forest = RandomForestRegressor(n_estimators=200, random_state=0)
my_pipeline = Pipeline(steps=[('preprocessor', mean_imputer), ('model', forest)])


In [None]:
# 5-fold cross-validation of the whole pipeline. The
# 'neg_mean_absolute_error' scorer returns MAE with its sign flipped, so the
# fold scores are negated to get positive MAE values back.
neg_mae_per_fold = cross_val_score(
    my_pipeline,
    train_num,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
scores = -neg_mae_per_fold

print("Average MAE score:", scores.mean())

In [None]:
# Fit the pipeline on the full training data, then predict revenue for the
# numeric columns of the test set.
my_pipeline.fit(train_num, y)
preds = my_pipeline.predict(test_num)

# Build the submission keyed by the test set's own `id` values.
# The previous version wrote the positional row index (0, 1, 2, ...) under an
# 'Id' header, which does not identify the test rows.
# NOTE(review): assumes test.csv exposes an `id` column and that the
# competition expects the header `id` -- confirm against sample_submission.csv.
output = pd.DataFrame({'id': test['id'],
                       'revenue': preds})
output.to_csv('submission.csv', index=False)