### 0. Import libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.style.use("ggplot")
# Default size (in inches) for every figure that does not set its own.
plt.rcParams["figure.figsize"] = (8, 8)
# NOTE(review): this silences *all* warnings for the rest of the notebook,
# including any pandas/seaborn/scipy warnings raised by later cells.
warnings.filterwarnings("ignore")

### 1. Load data

In [None]:
# Read the raw table; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/exoplanets-database/kepler.csv")
# Preview the first rows.
df.head()

### 2. Exploratory Data Analysis (EDA)

Let's look at the main characteristics of our data: 

In [None]:
# (rows, columns) of the raw table.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# The first CSV header is "# name"; give it a plain identifier.
df.rename(columns = {"# name": "name"}, inplace = True)

1. Take a look at the string columns

In [None]:
# Every column whose dtype is not numeric.
df_string = df.select_dtypes(exclude = "number")
df_string.columns

2. And take a look at the float columns:

In [None]:
# Every column with a numeric dtype.
df_float = df.select_dtypes(include = "number")
df_float.columns

To solve our problems, we will need the following columns:
1. name
2. mass
3. radius
4. molecules
5. orbital_period
6. detection_type
7. star_mass

In [None]:
# Keep only the columns needed for the analysis. `.copy()` makes this an
# independent frame, so the column assignments in later cells modify it
# directly instead of a slice of `df` (which triggers pandas'
# SettingWithCopyWarning).
analysis_columns = ["name", "mass", "radius", "molecules", "orbital_period", "detection_type", "star_mass"]
dropped_data = df[analysis_columns].copy()
dropped_data.head()

3. We need to find the percentage of missing data:

In [None]:
# Percentage of missing values per column.
missing_counts = dropped_data.isna().sum()
df_nan = missing_counts * 100 / len(dropped_data)
df_nan.round(2)

In [None]:
# Bar chart of the missing-value percentages.
# NOTE(review): five colours are listed for seven columns; presumably
# matplotlib cycles through them — confirm the colouring is intended.
df_nan.round(2).plot.bar(color = ["red", "green", "blue", "yellow", "orange"]);

And take a look at the heatmap:

In [None]:
# Two-colour map for the boolean missing-value mask: the first colour (green)
# is used for present values, the second (red) for missing ones.
colors = ["#27f20c", "#f20c0c"]
sns.heatmap(dropped_data.isna(), cbar = False, cmap = colors);

Conclusion: the following columns contain missing data:
1. mass (57.5%)
2. discovered (0.03%)
3. radius (24.28%)
4. molecules (98.71%)
5. orbital_period (3.67%)
6. star_mass (10.13%)

Now we need to handle the missing values:

In [None]:
# Fill the gaps in the numeric columns by linear interpolation and mark
# unknown molecules with the sentinel string "N/A".
# `assign` builds a new frame instead of writing column-by-column into
# `dropped_data`, which was sliced from `df` (that pattern triggers pandas'
# SettingWithCopyWarning).
# NOTE(review): linear interpolation fills a gap from the neighbouring rows,
# which is only meaningful if the row order carries information — confirm.
interpolated_columns = ["radius", "orbital_period", "mass", "star_mass"]
dropped_data = dropped_data.assign(
    **{
        column: dropped_data[column].interpolate(method = "linear")
        for column in interpolated_columns
    },
    molecules = dropped_data["molecules"].fillna("N/A"),
)

In [None]:
# Number of missing values that remain in each column.
dropped_data.isna().sum()

In [None]:
# (rows, columns) before the remaining incomplete rows are removed.
dropped_data.shape

In [None]:
# Delete rows that still contain any missing value. Rebinding the name
# (instead of `inplace = True`) avoids mutating a frame that was derived
# from a slice of `df`.
dropped_data = dropped_data.dropna()
dropped_data.shape

In [None]:
# Preview the cleaned table.
dropped_data.head()

### 3.1) How does the mass of the planets correlate with their radius? What does this tell us about the planets' compositions?

In [None]:
# The two variables compared in this section.
corr_data = dropped_data.loc[:, ["mass", "radius"]]

In [None]:
# Annotated correlation matrix of mass and radius.
corr_matrix = corr_data.corr()
sns.heatmap(corr_matrix, annot = True, square = True, cmap = "autumn", cbar = True);

In [None]:
# statistical significance of correlation coefficients
def correlation_pearson(data, alpha = 0.05):
    """Print the Pearson correlation and its significance for every column pair.

    Each pair of columns ``(i, j)`` with ``i <= j`` is tested with
    ``scipy.stats.pearsonr`` and the ``(statistic, p-value)`` tuple is printed
    together with a verdict. The diagonal pairs ``(i, i)`` are included, so
    every column is also compared with itself.

    Parameters
    ----------
    data : array-like, 2-D
        Observations in rows, variables in columns. Anything ``np.asarray``
        can convert is accepted (ndarray, nested lists, numeric DataFrame).
    alpha : float, default 0.05
        Significance level; a p-value strictly below it is reported as
        statistically significant.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    data = np.asarray(data)
    n_columns = data.shape[1]

    for i in range(n_columns):
        for j in range(i, n_columns):
            statistics, p_value = stats.pearsonr(data[:, i], data[:, j])

            if p_value < alpha:
                verdict = "- Statistically significant"
            else:
                verdict = "- Statistically not significant"
            print((statistics, p_value), verdict)

In [None]:
# Pearson test for mass vs. radius; the function indexes with `data[:, i]`,
# so it is given the underlying array rather than the DataFrame.
correlation_pearson(corr_data.values)

In [None]:
# Scatter plot of radius against mass with a fitted regression line.
# NOTE(review): `palette` is passed without a `hue` variable, so it presumably
# has no visible effect — confirm and drop it if so.
sns.lmplot(data = corr_data, x = "mass", y = "radius", palette = "husl");

In [None]:
# Row-wise mass-to-radius ratio.
corr_data["mass"] / corr_data["radius"]

As we can see, **there is no correlation between the mass and radius** of the discovered exoplanets. 
Let's check the distributions of the mass and radius of the planets:

In [None]:
def _report_normality(label, p_value, normal_ending):
    """Print the verdict of one normality test at the 5 % significance level."""
    if p_value < 0.05:
        print("{}: The sample is not taken from a normal distribution! P-value: {}".format(label, p_value))
    else:
        print("{}: The sample is taken from a normal distribution{} P-value: {}".format(label, normal_ending, p_value))


def plot_histogram_and_qq(data, column_name = "Histogram", distribution_type="norm", normal_tests = True):
    """Plot a density histogram and a Q-Q plot of a sample and print summary statistics.

    The left panel shows a 30-bin histogram normalised to a density with the
    normal PDF for the sample mean and standard deviation overlaid; the right
    panel shows a probability (Q-Q) plot against ``distribution_type``.
    Afterwards the min/max, three-sigma interval, mean/mode/median and
    skewness/excess kurtosis are printed, optionally followed by normality
    tests.

    Parameters
    ----------
    data : array-like, 1-D
        Sample to describe. Anything ``np.asarray`` can convert is accepted.
    column_name : str, default "Histogram"
        Title of the histogram panel.
    distribution_type : str, default "norm"
        Reference distribution, passed to ``scipy.stats.probplot`` as ``dist``.
    normal_tests : bool, default True
        When true and ``distribution_type == "norm"``, run D'Agostino's test
        plus either Jarque-Bera (more than 2000 observations) or Shapiro-Wilk
        (otherwise) and print each verdict.

    Returns
    -------
    None
        Everything is plotted or printed.
    """
    values = np.asarray(data)

    mu = np.mean(values)
    sigma = np.std(values)

    _, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(12, 6))

    # Histogram normalised to a density so the normal PDF shares its scale
    _, bins, _ = ax_hist.hist(values, 30, density=True)
    ax_hist.set_title(column_name)
    ax_hist.set_xlabel('Value')
    ax_hist.set_ylabel('Density')

    # Overlay the bell curve (normal distribution) evaluated at the bin edges;
    # skipped for a constant sample, where sigma == 0 would divide by zero
    if sigma > 0:
        bell_curve = 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp( - (bins - mu)**2 / (2 * sigma**2))
        ax_hist.plot(bins, bell_curve, linewidth=2, color='r')

    # Q-Q plot
    stats.probplot(values, dist=distribution_type, plot=ax_qq)
    plt.show()

    print("-"*50)
    print("Min/Max: ", [np.min(values), np.max(values)])
    print("The Three Sigma Rule: ", [mu - 3 * sigma, mu + 3 * sigma])
    print("Mean/Mode/Median: ", [np.mean(values), stats.mode(values), np.median(values)])
    print("Skewness/Excess: ", [stats.skew(values), stats.kurtosis(values)])

    if not (normal_tests and distribution_type == "norm"):
        return

    print("-"*50)

    _, p = stats.normaltest(values)
    _report_normality("D'Agostino's test", p, ".")

    # Jarque-Bera is used only for large samples (SciPy documents it as
    # suitable for more than 2000 observations); Shapiro-Wilk otherwise
    if values.shape[0] > 2000:
        _, p = stats.jarque_bera(values)
        _report_normality("Jarque Bera test", p, "!.")
    else:
        _, p = stats.shapiro(values)
        _report_normality("Shapiro's test", p, "!.")

In [None]:
# One histogram / Q-Q figure (plus printed statistics) per column.
for column, series in corr_data.items():
    plot_histogram_and_qq(series, column)

In [None]:
# Number of planets per `molecules` value ("N/A" marks rows where it was missing).
dropped_data["molecules"].value_counts()

We can't use two-factor analysis of variance to understand whether there is a relationship between the mass, radius, and composition of the planet, because there are too few observations in each of the groups and the data are not normally distributed. We can't use non-parametric methods either. Let's look at the plot and try to draw a conclusion from it:

In [None]:
# Keep only the planets with a known `molecules` value ("N/A" marks missing ones).
has_molecules = dropped_data["molecules"] != "N/A"
temp_data = dropped_data.loc[has_molecules, ["mass", "radius", "molecules"]]
ax = sns.scatterplot(data = temp_data, x = "radius", y = "mass", hue = "molecules");
# Shrink the legend text and title.
legend = ax.get_legend()
plt.setp(legend.get_texts(), fontsize='5');
plt.setp(legend.get_title(), fontsize='6');

As we can see, as the mass of the planet or its radius increases, the composition of the planet changes significantly towards heavier elements.

### 3.2) How does the size of the planet correlate with the orbital period? And what correlations are there with the spectral type of the host stars?

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the three variables of interest; the scaled values come back as
# a plain array, so they are wrapped in a new DataFrame with the same columns.
orbital_df = dropped_data.loc[:, ["mass", "orbital_period", "star_mass"]]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(orbital_df)
scaler_df = pd.DataFrame(scaled_values, columns = orbital_df.columns)

In [None]:
# Annotated correlation matrix of the standardised variables.
scaled_corr_matrix = scaler_df.corr()
sns.heatmap(scaled_corr_matrix, annot = True, square = True, cmap = "autumn", cbar = False);

Let's see if our correlation coefficients are statistically significant:

In [None]:
# Pearson test for every pair of the standardised columns.
correlation_pearson(scaler_df.values)

In [None]:
# Orbital period against planet mass (standardised), regression line in blue.
sns.lmplot(data = scaler_df, x = "mass", y = "orbital_period", line_kws={'color': 'blue'});

As we can see, the mass of the planet and its orbital period do not have a direct relationship.

In [None]:
# Orbital period against host-star mass (standardised) with a regression line.
sns.lmplot(data = scaler_df, x = "star_mass", y = "orbital_period");

In [None]:
# Pairwise view: distributions on the diagonal, scatter plots elsewhere.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases in
# favour of `histplot` / `displot` — check the installed version before migrating.
g = sns.PairGrid(scaler_df)
g.map_diag(sns.distplot)
g.map_offdiag(sns.scatterplot);

Let's fit a linear regression to our data to estimate the coefficient of determination ($R^2$, the squared multiple correlation coefficient):

In [None]:
import statsmodels.api as sm

In [None]:
# Response and predictors for the regression; `add_constant` adds the
# intercept column to the predictors.
predictors = scaler_df[["mass", "star_mass"]]
Y = scaler_df["orbital_period"]
X = sm.add_constant(predictors)
X, Y

In [None]:
# Ordinary least squares: orbital_period ~ const + mass + star_mass.
model = sm.OLS(Y, X)
results = model.fit()
# Full regression report.
results.summary()

The coefficient of determination ($R^2$) is very small, which tells us that the orbital period does not depend only on the mass of the planet and the mass of the star (this is indicated by Student's t-tests on the coefficients and by the F-test for the overall significance of the regression).

### 3.3) Which are the best detection methods? What are their limitations? 

In [None]:
# Number of planets per detection method.
bar_colors = ["red", "green", "blue", "yellow", "orange"]
dropped_data["detection_type"].value_counts().plot.bar(color = bar_colors);

In [None]:
# Planets per detection method, shown as a pie chart with percentage labels
# formatted to three decimals.
detection_values = dropped_data["detection_type"].value_counts()
plt.pie(detection_values, labels = detection_values.index, shadow = True, autopct='%.3f');

In [None]:
# Share of each detection method as a percentage of all remaining rows.
detection_values * 100 / len(dropped_data)