In [1]:
import pandas as pd
import sqlite3
import pickle
import category_encoders as ce
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
df = pd.read_csv("satellite_info.csv")
df.head(2)

Unnamed: 0,s_no,sat_Name,Discipline,Launch_mass,Launch_date,Launch_vehicle,Launch_site,Periapsis_km,Apoapsis_km,Period_in_minutes,Success
0,1,Aryabhatta,Earth Science Space Science,360.0,1975-04-19,Interkosmos-2,Kapustin Yar,568.0,611.0,96.5,Pass
1,2,Bhaskara Sega-I,Astronomy Communications Engineering Earth Sci...,444.0,1979-06-07,Modified SS-5,Kapustin Yar,512.0,557.0,95.2,Pass


In [3]:
df.dtypes

s_no                   int64
sat_Name              object
Discipline            object
Launch_mass          float64
Launch_date           object
Launch_vehicle        object
Launch_site           object
Periapsis_km         float64
Apoapsis_km          float64
Period_in_minutes    float64
Success               object
dtype: object

In [4]:
print(df.isnull().sum())

s_no                 0
sat_Name             0
Discipline           0
Launch_mass          0
Launch_date          0
Launch_vehicle       0
Launch_site          0
Periapsis_km         0
Apoapsis_km          0
Period_in_minutes    0
Success              0
dtype: int64


In [5]:
df.shape

(139, 11)

In [6]:
for c in df.columns: 
    print(c)
    print(df[c].unique())

s_no
[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139]
sat_Name
['Aryabhatta' 'Bhaskara Sega-I' 'Rohini' 'Rohini RS-1' 'Rohini RS-D1'
 'APPLE' 'Bhaskara -II' 'INSAT-1A' 'Rohini RS-D2' 'INSAT-1B' 'SROSS-1'
 'IRS-1A' 'SROSS-2' 'INSAT-1C' 'INSAT-1D' 'IRS-1B' 'INSAT-2DT' 'SROSS-C'
 'INSAT-2A' 'INSAT-2B' 'RS-1E' 'SROSS-C2' 'IRS-P2' 'INSAT-2C' 'IRS-1C'
 'IRS-P3' 'INSAT-2D' 'IRS-1D' 'INSAT-2E' 'OceanSat-1' 'INSAT-3B' 'GSAT-1'
 'TES' 'INSAT-3C' 'Kalpana-1' 'INSAT-3A' 'GSAT-2' 'INSAT-3E'
 

In [7]:
clean_data = df.drop(['s_no', 'sat_Name'], axis=1)
clean_data.columns

Index(['Discipline', 'Launch_mass', 'Launch_date', 'Launch_vehicle',
       'Launch_site', 'Periapsis_km', 'Apoapsis_km', 'Period_in_minutes',
       'Success'],
      dtype='object')

In [8]:
clean_data['Success'] = clean_data['Success'].replace(['Pass', 'pass'], 'Pass')
clean_data.Success.unique()

array(['Pass', 'Fail'], dtype=object)

In [9]:
clean_data.Launch_date = pd.to_datetime(clean_data.Launch_date)
clean_data.Launch_date

0     1975-04-19
1     1979-06-07
2     1979-08-10
3     1980-07-18
4     1981-05-31
         ...    
134   2022-02-14
135   2022-06-22
136   2022-06-30
137   2022-08-07
138   2022-08-07
Name: Launch_date, Length: 139, dtype: datetime64[ns]

In [10]:
def classify_orbit(row):
    if row['Periapsis_km'] <= 2000 and row['Apoapsis_km'] <= 2000:
        return 'LEO'
    elif row['Periapsis_km'] > 2000 and row['Apoapsis_km'] > 2000:
        return 'GEO'
    elif row['Periapsis_km'] > 2000 and row['Apoapsis_km'] <= 20000:
        return 'MEO'
    else:
        return 'Other'

clean_data['Orbit_Type'] = clean_data.apply(classify_orbit, axis=1)
clean_data.Orbit_Type.unique()

array(['LEO', 'GEO', 'Other'], dtype=object)

In [11]:
clean_data['Launch_year'] = clean_data.Launch_date.dt.year
clean_data['Launch_month'] = clean_data.Launch_date.dt.month

In [12]:
def categorize_purpose(row):
    if 'Earth Science' in row['Discipline']:
        return 'Earth Observation'
    elif 'Astronomy' in row['Discipline'] or 'Space Science' in row['Discipline']:
        return 'Space Exploration'
    elif 'Communications' in row['Discipline']:
        return 'Communication'
    elif 'Engineering' in row['Discipline']:
        return 'Engineering'
    else:
        return 'Other'

clean_data['Satellite_Purpose'] = clean_data.apply(categorize_purpose, axis=1)
clean_data.Satellite_Purpose.unique()


array(['Earth Observation', 'Other', 'Communication', 'Space Exploration',
       'Engineering'], dtype=object)

In [13]:
clean_data.columns

Index(['Discipline', 'Launch_mass', 'Launch_date', 'Launch_vehicle',
       'Launch_site', 'Periapsis_km', 'Apoapsis_km', 'Period_in_minutes',
       'Success', 'Orbit_Type', 'Launch_year', 'Launch_month',
       'Satellite_Purpose'],
      dtype='object')

In [14]:
conn = sqlite3.connect('instance/majordb.db')
clean_data.to_sql('satellite', conn, if_exists='replace', index=False)
conn.commit()
conn.close()

fetchign data

In [15]:
conn = sqlite3.connect('instance/majordb.db')

fetched_data = pd.read_sql("SELECT * FROM satellite", conn)
conn.close()
fetched_data.head(2)

Unnamed: 0,Discipline,Launch_mass,Launch_date,Launch_vehicle,Launch_site,Periapsis_km,Apoapsis_km,Period_in_minutes,Success,Orbit_Type,Launch_year,Launch_month,Satellite_Purpose
0,Earth Science Space Science,360.0,1975-04-19 00:00:00,Interkosmos-2,Kapustin Yar,568.0,611.0,96.5,Pass,LEO,1975,4,Earth Observation
1,Astronomy Communications Engineering Earth Sci...,444.0,1979-06-07 00:00:00,Modified SS-5,Kapustin Yar,512.0,557.0,95.2,Pass,LEO,1979,6,Earth Observation


In [16]:
fetched_data.Discipline.unique()

array(['Earth Science Space Science',
       'Astronomy Communications Engineering Earth Sciences',
       'Experimental', 'Earth Sciences', 'Communications',
       'Engineering Earth Sciences', 'Communications Earth Sciences',
       'Astronomy Space Physics',
       'Astronomy Earth Sciences Space Physics',
       'Astronomy Earth Sciences', 'Communications Engineering',
       'Engineering', 'Avionics', 'Planetary Sciences',
       'Earth Sciences Technology Applications',
       'Solar Physics Space Physics', 'Meteorological',
       'Navigation/Global Positioning', 'Planetary Science',
       'Re-entry Experiment', 'Space Sciences', 'Technology Applications',
       'Communications Technology Applications',
       'High-throughput Communication Satellite',
       'Hyperspectral imaging satellite',
       'Communications technology demonstrator',
       'Communications satellite', 'Military satellite',
       'Earth imaging for defense applications', 'Student satellite',
       'H

In [17]:
encoder = ce.OrdinalEncoder(mapping=[{'col': 'Satellite_Purpose', 'mapping': {'Earth Observation': 1, 'Other':2, 'Communication':3, 'Space Exploration':4,'Engineering':5} },
                                     {'col': 'Orbit_Type', 'mapping': {'LEO':1, 'GEO':2, 'Other':3}},
                                     ])
encoder.fit(fetched_data)
fetched_data = encoder.transform(fetched_data)


In [18]:
from sklearn.metrics import f1_score

y = fetched_data['Success']
X = fetched_data.drop(columns=['Success', 'Launch_date', 'Discipline', 'Launch_vehicle', 'Launch_site'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

decision_tree_clf = DecisionTreeClassifier()
random_forest_clf = RandomForestClassifier()
logistic_regression_clf = LogisticRegression()

decision_tree_clf.fit(X_train, y_train)
random_forest_clf.fit(X_train, y_train)
logistic_regression_clf.fit(X_train, y_train)


dt_pred = decision_tree_clf.predict(X_test)
rf_pred = random_forest_clf.predict(X_test)
lr_pred = logistic_regression_clf.predict(X_test)

dt_f1 = f1_score(y_test, dt_pred, average='weighted')
rf_f1 = f1_score(y_test, rf_pred, average='weighted')
lr_f1 = f1_score(y_test, lr_pred, average='weighted')

print("Decision Tree F1 Score:", dt_f1)
print("Random Forest F1 Score:", rf_f1)
print("Logistic Regression F1 Score:", lr_f1)
print(X.columns)


Decision Tree F1 Score: 0.9025210084033614
Random Forest F1 Score: 0.9025210084033614
Logistic Regression F1 Score: 0.876010781671159
Index(['Launch_mass', 'Periapsis_km', 'Apoapsis_km', 'Period_in_minutes',
       'Orbit_Type', 'Launch_year', 'Launch_month', 'Satellite_Purpose'],
      dtype='object')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


decision tree looks good. 
taking it.

In [19]:
# pickle.dump(decision_tree_clf, open('ds2_reg.pkl', 'wb'))

regression

In [20]:
encoder = ce.OrdinalEncoder(mapping=[{'col': 'Success', 'mapping': {'Pass': 1, 'Fail':2} }])
encoder.fit(fetched_data)
fetched_data = encoder.transform(fetched_data)

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

X = fetched_data.drop(columns=['Launch_mass','Launch_year', 'Launch_month', 'Launch_date', 'Discipline', 'Launch_vehicle', 'Launch_site', 'Success'])
y = fetched_data['Launch_mass']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_y_pred = linear_model.predict(X_test)
linear_rmse = mean_squared_error(y_test, linear_y_pred, squared=False)

# Initialize and train the Random Forest Regression model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_rmse = mean_squared_error(y_test, rf_y_pred, squared=False)

# Initialize and train the Gradient Boosting Regression model
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)
gb_rmse = mean_squared_error(y_test, gb_y_pred, squared=False)

# Print RMSE for each model
print("Linear Regression RMSE:", linear_rmse)
print("Random Forest Regression RMSE:", rf_rmse)
print("Gradient Boosting Regression RMSE:", gb_rmse)
fetched_data.head(4)




Linear Regression RMSE: 33110.48297984864
Random Forest Regression RMSE: 1233.9841246597027
Gradient Boosting Regression RMSE: 1190.5228350026875




Unnamed: 0,Discipline,Launch_mass,Launch_date,Launch_vehicle,Launch_site,Periapsis_km,Apoapsis_km,Period_in_minutes,Success,Orbit_Type,Launch_year,Launch_month,Satellite_Purpose
0,Earth Science Space Science,360.0,1975-04-19 00:00:00,Interkosmos-2,Kapustin Yar,568.0,611.0,96.5,1,1,1975,4,1
1,Astronomy Communications Engineering Earth Sci...,444.0,1979-06-07 00:00:00,Modified SS-5,Kapustin Yar,512.0,557.0,95.2,1,1,1979,6,1
2,Experimental,35.0,1979-08-10 00:00:00,SLV-3-E1,Satish Dhawan Space Centre Sriharikota,0.0,0.0,0.0,2,1,1979,8,2
3,Earth Sciences,35.0,1980-07-18 00:00:00,SLV-3-E2,Satish Dhawan Space Centre Sriharikota,305.0,919.0,96.9,1,1,1980,7,1


lower the RMSE value, better the model will be. hence choosing rendom forest

In [22]:
pickle.dump(gb_model, open('ds2_reg.pkl', 'wb'))