Hi! In this kernel we will try out different methods of feature selection. Thanks for your upvotes.

In [None]:
# importing the essential libraries
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the training split from the Kaggle input directory and preview it.
train_csv_path = '/kaggle/input/tmdb-box-office-prediction/train.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head()

In [None]:
# let us see the types of data in the training set
# (one dtype per column, to spot which columns are not numeric yet)
train_data.dtypes

Looks like we have a lot of feature engineering to do. 
1. The columns id, belongs_to_collection, imdb_id, homepage, original_title, overview, poster_path, release_date, tagline, title, Keywords, cast and crew can be removed (for now). We can deal with them later. 
2. Some features hold their values as lists of dictionaries, so we have to convert those values into a format a model can use.

In [None]:
# Columns left out of the feature set for now (see the list above).
dropped_columns = [
    'id', 'belongs_to_collection', 'homepage', 'imdb_id', 'original_title',
    'overview', 'poster_path', 'release_date', 'tagline', 'title',
    'Keywords', 'cast', 'crew',
]
training_data = train_data.drop(columns = dropped_columns)
training_data.head()

In [None]:
# checking the missing values
# (number of null entries in each remaining column)
training_data.isnull().sum()

In [None]:
# filling the missing values
# Numeric columns are filled with the number 0 so that they keep their numeric
# dtype (filling them with the string '0' would turn them into object columns).
# Every other column is filled with the string '0', the placeholder that
# feature_engineering() recognises as "no entries".
numeric_columns = training_data.select_dtypes(include = 'number').columns
training_data = (
    training_data
    .fillna({column: 0 for column in numeric_columns})
    .fillna('0')
)
training_data.isnull().sum()

We can pull the `name` values out of the dictionary-valued columns and convert them into numeric data using Label Encoder. 
I prefer to use Label Encoder instead of one-hot encoding (get_dummies) because Label Encoder keeps a single column and stores an integer code in it, whereas get_dummies creates a new column for every category. Keep in mind that these integer codes imply an ordering that the categories do not really have.

In [None]:
def feature_engineering(series):
    """Label-encode a column of stringified lists of dictionaries.

    Each entry of ``series`` is expected to be the string form of a list of
    dictionaries (e.g. ``"[{'id': 18, 'name': 'Drama'}]"``). The ``name``
    values of one entry are joined with ``' + '`` into a single label, and
    the labels are then mapped to integers with ``LabelEncoder``.

    Missing entries produce an empty label: this covers the ``'0'``
    placeholder written by ``fillna('0')`` as well as any non-string value
    such as ``None`` or ``NaN`` (which previously raised inside
    ``ast.literal_eval``).

    Parameters
    ----------
    series : iterable
        Column values to encode.

    Returns
    -------
    numpy.ndarray
        One integer code per entry of ``series``, in the same order.

    Notes
    -----
    A new ``LabelEncoder`` is fitted on every call, so the integer codes are
    only meaningful within the column that was passed in.
    """
    labels = []
    for entry in series:
        names = []
        # Only real strings other than the '0' placeholder are parsed.
        if isinstance(entry, str) and entry != '0':
            for item in ast.literal_eval(entry):
                if 'name' in item:
                    names.append(item['name'])
        labels.append(' + '.join(names))
    return LabelEncoder().fit_transform(labels)

In [None]:
# Feature Engineering
# Index the rows by the `id` column, then replace every categorical column
# with integer codes.
training_data.index = train_data['id']

# Columns that hold stringified lists of dictionaries.
for column in ['genres', 'production_companies', 'production_countries', 'spoken_languages']:
    training_data[column] = feature_engineering(training_data[column])

# Columns that hold one plain label per row.
for column in ['original_language', 'status']:
    training_data[column] = LabelEncoder().fit_transform(training_data[column])

In [None]:
# preview the training features after encoding
training_data.head()

Now we will visualize a few features.

In [None]:
# Pairwise correlations between all columns, drawn as a heatmap.
correlation_matrix = training_data.corr()
sns.heatmap(correlation_matrix)

Budget and popularity are the features most correlated with revenue. Let us look at some scatter plots.

In [None]:
# Scatter plot with revenue on the x-axis and budget on the y-axis. The axes
# are labelled so the figure can be read without looking at the code.
fig, ax = plt.subplots()
ax.plot(training_data['revenue'], training_data['budget'], 'o', label = 'revenue VS budget')
ax.set_xlabel('revenue')
ax.set_ylabel('budget')
ax.legend();

In [None]:
# Scatter plot with revenue on the x-axis and popularity on the y-axis. The
# axes are labelled so the figure can be read without looking at the code.
fig, ax = plt.subplots()
ax.plot(training_data['revenue'], training_data['popularity'], 'o', label = 'revenue VS popularity')
ax.set_xlabel('revenue')
ax.set_ylabel('popularity')
ax.legend();

Looks like we need to handle outliers too. (I will work on it in the next version)

In [None]:
# Separate the target (revenue) from the feature columns.
target_column = 'revenue'
y = training_data[target_column]
X = training_data.drop(columns = [target_column])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 101,
)

In [None]:
# Fit a linear-regression baseline on the training split.
# (LinearRegression is imported with the other libraries at the top.)
model = LinearRegression()
model.fit(X_train, y_train)

validations = model.predict(X_val)

# RMSE on the validation split. scikit-learn's metric signature is
# (y_true, y_pred), so the true values go first.
print(np.sqrt(mean_squared_error(y_val, validations)))

Now we will load the test data.

In [None]:
# Read the test split from the Kaggle input directory and preview it.
test_csv_path = '/kaggle/input/tmdb-box-office-prediction/test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.head()

In [None]:
# Drop the same columns that were removed from the training data.
dropped_columns = [
    'id', 'belongs_to_collection', 'homepage', 'imdb_id', 'original_title',
    'overview', 'poster_path', 'release_date', 'tagline', 'title',
    'Keywords', 'cast', 'crew',
]
testing_data = test_data.drop(columns = dropped_columns)

In [None]:
# filling the missing values
# Numeric columns are filled with the number 0 so that they keep their numeric
# dtype (filling them with the string '0' would turn them into object columns).
# Every other column is filled with the string '0', the placeholder that
# feature_engineering() recognises as "no entries".
numeric_columns = testing_data.select_dtypes(include = 'number').columns
testing_data = (
    testing_data
    .fillna({column: 0 for column in numeric_columns})
    .fillna('0')
)
testing_data.isnull().sum()

In [None]:
# Feature Engineering
# The model was fitted on integer codes learned from the TRAINING columns, so
# the test columns have to be mapped with those same label -> code tables.
# Fitting a fresh LabelEncoder on the test data numbers the test set's own
# sorted labels, so the same code could stand for a different genre /
# language / status than it did during training.
def _joined_names(column):
    """Return one ' + '-joined string of `name` values per entry of `column`."""
    labels = []
    for entry in column:
        names = []
        if entry != '0':
            names = [item['name'] for item in ast.literal_eval(entry) if 'name' in item]
        labels.append(' + '.join(names))
    return labels


def _encode_like_training(train_labels, test_labels):
    """Encode `test_labels` with the codes LabelEncoder gives `train_labels`.

    LabelEncoder numbers the sorted unique labels 0..n-1, which is the order
    np.unique returns. Labels that never occur in the training data are
    encoded as -1.
    """
    classes = np.unique(np.asarray(train_labels, dtype = str))
    return pd.Categorical(np.asarray(test_labels, dtype = str), categories = classes).codes


testing_data.index = test_data['id']

# Columns that hold stringified lists of dictionaries.
for column in ['genres', 'production_companies', 'production_countries', 'spoken_languages']:
    testing_data[column] = _encode_like_training(
        _joined_names(train_data[column].fillna('0')),
        _joined_names(testing_data[column]),
    )

# Columns that hold one plain label per row.
for column in ['original_language', 'status']:
    testing_data[column] = _encode_like_training(
        train_data[column].fillna('0'),
        testing_data[column],
    )

In [None]:
# preview the test features after encoding
testing_data.head()

In [None]:
# Revenue cannot be negative, but a linear model can extrapolate below zero,
# so clamp the predictions at 0 before they are submitted.
predictions = np.clip(model.predict(testing_data), 0, None)

In [None]:
# Now creating a dataset and submitting
# Two columns: the ids from the test file and the predicted revenue per row.
submission_columns = {'id' : test_data['id'], 'revenue' : predictions}
submission = pd.DataFrame(submission_columns)
submission.head()

In [None]:
# write the submission file; index = False keeps the DataFrame index out of the CSV
submission.to_csv('submission.csv', index = False)

Thank you for viewing my kernel. Appreciate your time and encouragement. :)